| { |
| "best_metric": 0.443807452917099, |
| "best_model_checkpoint": "CTCLLMs_self_tokenizer/checkpoints/LongSpeech_CTC-Shrink_augment_data_self_tokenizer_addMLS_projector_restore/checkpoint-30000", |
| "epoch": 1.0, |
| "eval_steps": 1000, |
| "global_step": 31479, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0006353441977191143, |
| "grad_norm": 45.06840896606445, |
| "learning_rate": 3.597883597883598e-06, |
| "loss": 72.1477, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.0012706883954382287, |
| "grad_norm": 56.45563507080078, |
| "learning_rate": 7.830687830687831e-06, |
| "loss": 71.8917, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.001906032593157343, |
| "grad_norm": 62.59088897705078, |
| "learning_rate": 1.1851851851851853e-05, |
| "loss": 71.7764, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.0025413767908764573, |
| "grad_norm": 75.64707946777344, |
| "learning_rate": 1.6084656084656086e-05, |
| "loss": 70.9277, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.003176720988595572, |
| "grad_norm": 73.5933837890625, |
| "learning_rate": 2.031746031746032e-05, |
| "loss": 68.0688, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.003812065186314686, |
| "grad_norm": 77.9434814453125, |
| "learning_rate": 2.4550264550264552e-05, |
| "loss": 65.4844, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.004447409384033801, |
| "grad_norm": 81.92144775390625, |
| "learning_rate": 2.8783068783068785e-05, |
| "loss": 61.2486, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.005082753581752915, |
| "grad_norm": 91.82105255126953, |
| "learning_rate": 3.3015873015873014e-05, |
| "loss": 55.9783, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.005718097779472029, |
| "grad_norm": 103.17108917236328, |
| "learning_rate": 3.724867724867725e-05, |
| "loss": 51.7487, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.006353441977191144, |
| "grad_norm": 98.97240447998047, |
| "learning_rate": 4.148148148148148e-05, |
| "loss": 45.0213, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.006988786174910258, |
| "grad_norm": 81.4900894165039, |
| "learning_rate": 4.5714285714285716e-05, |
| "loss": 38.3125, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.007624130372629372, |
| "grad_norm": 71.47420501708984, |
| "learning_rate": 4.9947089947089946e-05, |
| "loss": 33.2395, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.008259474570348486, |
| "grad_norm": 63.618309020996094, |
| "learning_rate": 5.417989417989419e-05, |
| "loss": 28.4421, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.008894818768067601, |
| "grad_norm": 58.004974365234375, |
| "learning_rate": 5.841269841269842e-05, |
| "loss": 25.048, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.009530162965786714, |
| "grad_norm": 46.489200592041016, |
| "learning_rate": 6.264550264550265e-05, |
| "loss": 21.9312, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.01016550716350583, |
| "grad_norm": 37.90148162841797, |
| "learning_rate": 6.687830687830688e-05, |
| "loss": 19.0696, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.010800851361224944, |
| "grad_norm": 36.47368240356445, |
| "learning_rate": 7.111111111111112e-05, |
| "loss": 17.0151, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.011436195558944057, |
| "grad_norm": 32.80181884765625, |
| "learning_rate": 7.534391534391536e-05, |
| "loss": 15.5522, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.012071539756663172, |
| "grad_norm": 25.543760299682617, |
| "learning_rate": 7.957671957671958e-05, |
| "loss": 14.1982, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.012706883954382287, |
| "grad_norm": 22.31871223449707, |
| "learning_rate": 8.380952380952382e-05, |
| "loss": 13.2314, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.0133422281521014, |
| "grad_norm": 18.374950408935547, |
| "learning_rate": 8.804232804232805e-05, |
| "loss": 12.4637, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.013977572349820515, |
| "grad_norm": 18.497610092163086, |
| "learning_rate": 9.227513227513229e-05, |
| "loss": 11.9765, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.01461291654753963, |
| "grad_norm": 14.529912948608398, |
| "learning_rate": 9.650793650793651e-05, |
| "loss": 11.2678, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.015248260745258743, |
| "grad_norm": 12.937056541442871, |
| "learning_rate": 0.00010074074074074073, |
| "loss": 10.6223, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.015883604942977858, |
| "grad_norm": 12.284934043884277, |
| "learning_rate": 0.00010497354497354497, |
| "loss": 10.189, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.016518949140696973, |
| "grad_norm": 9.824132919311523, |
| "learning_rate": 0.0001092063492063492, |
| "loss": 9.8138, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.017154293338416088, |
| "grad_norm": 8.129488945007324, |
| "learning_rate": 0.00011343915343915343, |
| "loss": 9.4242, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.017789637536135203, |
| "grad_norm": 9.27999496459961, |
| "learning_rate": 0.00011767195767195766, |
| "loss": 9.1365, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.018424981733854314, |
| "grad_norm": 5.250537872314453, |
| "learning_rate": 0.00012190476190476193, |
| "loss": 8.8276, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.01906032593157343, |
| "grad_norm": 5.430091381072998, |
| "learning_rate": 0.00012613756613756615, |
| "loss": 8.5892, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.019695670129292544, |
| "grad_norm": 3.3930234909057617, |
| "learning_rate": 0.0001303703703703704, |
| "loss": 8.3652, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.02033101432701166, |
| "grad_norm": 2.841287136077881, |
| "learning_rate": 0.00013460317460317462, |
| "loss": 8.1527, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.020966358524730774, |
| "grad_norm": 2.188707113265991, |
| "learning_rate": 0.00013883597883597885, |
| "loss": 7.9891, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.02160170272244989, |
| "grad_norm": 2.6337716579437256, |
| "learning_rate": 0.0001430687830687831, |
| "loss": 7.8345, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.022237046920169, |
| "grad_norm": 1.7390124797821045, |
| "learning_rate": 0.00014730158730158732, |
| "loss": 7.6817, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.022872391117888115, |
| "grad_norm": 1.6422362327575684, |
| "learning_rate": 0.00015153439153439154, |
| "loss": 7.5748, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.02350773531560723, |
| "grad_norm": 1.6876453161239624, |
| "learning_rate": 0.0001557671957671958, |
| "loss": 7.3896, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.024143079513326345, |
| "grad_norm": 1.230586290359497, |
| "learning_rate": 0.00016, |
| "loss": 7.3337, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.02477842371104546, |
| "grad_norm": 1.2059415578842163, |
| "learning_rate": 0.00016423280423280424, |
| "loss": 7.2545, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.025413767908764574, |
| "grad_norm": 1.5651260614395142, |
| "learning_rate": 0.00016846560846560849, |
| "loss": 7.1927, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.02604911210648369, |
| "grad_norm": 2.234393358230591, |
| "learning_rate": 0.0001726984126984127, |
| "loss": 7.1617, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.0266844563042028, |
| "grad_norm": 1.6703732013702393, |
| "learning_rate": 0.00017693121693121696, |
| "loss": 7.093, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.027319800501921915, |
| "grad_norm": 0.796870231628418, |
| "learning_rate": 0.00018116402116402118, |
| "loss": 7.0105, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.02795514469964103, |
| "grad_norm": 1.0919573307037354, |
| "learning_rate": 0.0001853968253968254, |
| "loss": 6.9911, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.028590488897360145, |
| "grad_norm": 1.3225408792495728, |
| "learning_rate": 0.00018962962962962965, |
| "loss": 6.9353, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.02922583309507926, |
| "grad_norm": 0.9445711970329285, |
| "learning_rate": 0.00019386243386243388, |
| "loss": 6.9075, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.029861177292798375, |
| "grad_norm": 1.0021796226501465, |
| "learning_rate": 0.0001980952380952381, |
| "loss": 6.8545, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.030496521490517486, |
| "grad_norm": 1.147709608078003, |
| "learning_rate": 0.00019999993595464, |
| "loss": 6.8145, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.0311318656882366, |
| "grad_norm": 1.4438824653625488, |
| "learning_rate": 0.00019999949134260042, |
| "loss": 6.7156, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.031767209885955716, |
| "grad_norm": 1.4000093936920166, |
| "learning_rate": 0.0001999986232924222, |
| "loss": 6.6363, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.031767209885955716, |
| "eval_loss": 6.87591028213501, |
| "eval_runtime": 46.4669, |
| "eval_samples_per_second": 58.17, |
| "eval_steps_per_second": 29.096, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.03240255408367483, |
| "grad_norm": 2.151993989944458, |
| "learning_rate": 0.00019999733180778103, |
| "loss": 6.5176, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.033037898281393946, |
| "grad_norm": 1.611135721206665, |
| "learning_rate": 0.00019999561689414561, |
| "loss": 6.4132, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.03367324247911306, |
| "grad_norm": 2.1010184288024902, |
| "learning_rate": 0.00019999347855877755, |
| "loss": 6.2465, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.034308586676832176, |
| "grad_norm": 1.5021122694015503, |
| "learning_rate": 0.0001999909168107314, |
| "loss": 6.1662, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.03494393087455129, |
| "grad_norm": 1.4672967195510864, |
| "learning_rate": 0.0001999879316608547, |
| "loss": 6.0509, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.035579275072270405, |
| "grad_norm": 1.4146413803100586, |
| "learning_rate": 0.0001999845231217877, |
| "loss": 5.9012, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.03621461926998951, |
| "grad_norm": 1.252382755279541, |
| "learning_rate": 0.00019998069120796358, |
| "loss": 5.815, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.03684996346770863, |
| "grad_norm": 1.6317933797836304, |
| "learning_rate": 0.0001999764359356082, |
| "loss": 5.771, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.03748530766542774, |
| "grad_norm": 1.2354493141174316, |
| "learning_rate": 0.0001999717573227401, |
| "loss": 5.6189, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.03812065186314686, |
| "grad_norm": 1.1442275047302246, |
| "learning_rate": 0.0001999666553891704, |
| "loss": 5.5078, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.03875599606086597, |
| "grad_norm": 1.3596833944320679, |
| "learning_rate": 0.0001999611301565027, |
| "loss": 5.4507, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.03939134025858509, |
| "grad_norm": 1.5420782566070557, |
| "learning_rate": 0.00019995518164813315, |
| "loss": 5.3225, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.0400266844563042, |
| "grad_norm": 2.335935354232788, |
| "learning_rate": 0.00019994880988925007, |
| "loss": 5.3398, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.04066202865402332, |
| "grad_norm": 1.2030448913574219, |
| "learning_rate": 0.00019994201490683406, |
| "loss": 5.2367, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.04129737285174243, |
| "grad_norm": 1.1881422996520996, |
| "learning_rate": 0.00019993479672965783, |
| "loss": 5.2073, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.04193271704946155, |
| "grad_norm": 1.2961896657943726, |
| "learning_rate": 0.00019992715538828609, |
| "loss": 5.157, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.04256806124718066, |
| "grad_norm": 0.9343932271003723, |
| "learning_rate": 0.00019991909091507525, |
| "loss": 5.0156, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.04320340544489978, |
| "grad_norm": 0.9654686450958252, |
| "learning_rate": 0.00019991060334417364, |
| "loss": 5.054, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.04383874964261889, |
| "grad_norm": 1.4537482261657715, |
| "learning_rate": 0.00019990169271152098, |
| "loss": 4.9824, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.044474093840338, |
| "grad_norm": 1.0155112743377686, |
| "learning_rate": 0.00019989235905484853, |
| "loss": 4.8496, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.045109438038057115, |
| "grad_norm": 0.8903729915618896, |
| "learning_rate": 0.00019988260241367875, |
| "loss": 4.8407, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.04574478223577623, |
| "grad_norm": 1.0020333528518677, |
| "learning_rate": 0.00019987242282932518, |
| "loss": 4.7753, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.046380126433495344, |
| "grad_norm": 1.2074095010757446, |
| "learning_rate": 0.0001998618203448923, |
| "loss": 4.6939, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.04701547063121446, |
| "grad_norm": 2.5281686782836914, |
| "learning_rate": 0.00019985079500527527, |
| "loss": 4.6567, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.047650814828933574, |
| "grad_norm": 1.257580280303955, |
| "learning_rate": 0.00019983934685715982, |
| "loss": 4.5615, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.04828615902665269, |
| "grad_norm": 1.5581581592559814, |
| "learning_rate": 0.00019982747594902203, |
| "loss": 4.6081, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.048921503224371804, |
| "grad_norm": 1.029440999031067, |
| "learning_rate": 0.0001998151823311281, |
| "loss": 4.491, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.04955684742209092, |
| "grad_norm": 0.9729529023170471, |
| "learning_rate": 0.0001998024660555342, |
| "loss": 4.4692, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.050192191619810034, |
| "grad_norm": 1.1230270862579346, |
| "learning_rate": 0.00019978932717608613, |
| "loss": 4.3839, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.05082753581752915, |
| "grad_norm": 1.048663854598999, |
| "learning_rate": 0.0001997757657484192, |
| "loss": 4.3907, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.051462880015248263, |
| "grad_norm": 1.2080233097076416, |
| "learning_rate": 0.000199761781829958, |
| "loss": 4.3147, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.05209822421296738, |
| "grad_norm": 1.1026450395584106, |
| "learning_rate": 0.000199747375479916, |
| "loss": 4.2496, |
| "step": 1640 |
| }, |
| { |
| "epoch": 0.052733568410686486, |
| "grad_norm": 1.037937879562378, |
| "learning_rate": 0.00019973254675929554, |
| "loss": 4.2614, |
| "step": 1660 |
| }, |
| { |
| "epoch": 0.0533689126084056, |
| "grad_norm": 1.1000276803970337, |
| "learning_rate": 0.00019971729573088742, |
| "loss": 4.1367, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.054004256806124716, |
| "grad_norm": 1.4259387254714966, |
| "learning_rate": 0.0001997016224592706, |
| "loss": 4.1126, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.05463960100384383, |
| "grad_norm": 1.2918739318847656, |
| "learning_rate": 0.00019968552701081203, |
| "loss": 4.0945, |
| "step": 1720 |
| }, |
| { |
| "epoch": 0.055274945201562946, |
| "grad_norm": 1.0148296356201172, |
| "learning_rate": 0.00019966900945366634, |
| "loss": 3.9981, |
| "step": 1740 |
| }, |
| { |
| "epoch": 0.05591028939928206, |
| "grad_norm": 1.4177788496017456, |
| "learning_rate": 0.0001996520698577755, |
| "loss": 3.9247, |
| "step": 1760 |
| }, |
| { |
| "epoch": 0.056545633597001176, |
| "grad_norm": 1.1384249925613403, |
| "learning_rate": 0.00019963470829486858, |
| "loss": 3.9204, |
| "step": 1780 |
| }, |
| { |
| "epoch": 0.05718097779472029, |
| "grad_norm": 1.2175607681274414, |
| "learning_rate": 0.0001996169248384615, |
| "loss": 3.9023, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.057816321992439405, |
| "grad_norm": 1.7040660381317139, |
| "learning_rate": 0.0001995987195638565, |
| "loss": 3.8349, |
| "step": 1820 |
| }, |
| { |
| "epoch": 0.05845166619015852, |
| "grad_norm": 1.4229464530944824, |
| "learning_rate": 0.0001995800925481421, |
| "loss": 3.7969, |
| "step": 1840 |
| }, |
| { |
| "epoch": 0.059087010387877635, |
| "grad_norm": 1.1412523984909058, |
| "learning_rate": 0.0001995610438701925, |
| "loss": 3.6494, |
| "step": 1860 |
| }, |
| { |
| "epoch": 0.05972235458559675, |
| "grad_norm": 1.3119606971740723, |
| "learning_rate": 0.00019954157361066764, |
| "loss": 3.6137, |
| "step": 1880 |
| }, |
| { |
| "epoch": 0.06035769878331586, |
| "grad_norm": 1.260469675064087, |
| "learning_rate": 0.0001995216818520123, |
| "loss": 3.5703, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.06099304298103497, |
| "grad_norm": 1.6222745180130005, |
| "learning_rate": 0.00019950136867845627, |
| "loss": 3.4526, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.06162838717875409, |
| "grad_norm": 1.399109125137329, |
| "learning_rate": 0.00019948063417601369, |
| "loss": 3.4467, |
| "step": 1940 |
| }, |
| { |
| "epoch": 0.0622637313764732, |
| "grad_norm": 1.1804718971252441, |
| "learning_rate": 0.00019945947843248276, |
| "loss": 3.3017, |
| "step": 1960 |
| }, |
| { |
| "epoch": 0.06289907557419232, |
| "grad_norm": 1.1146492958068848, |
| "learning_rate": 0.0001994379015374455, |
| "loss": 3.2564, |
| "step": 1980 |
| }, |
| { |
| "epoch": 0.06353441977191143, |
| "grad_norm": 1.3201006650924683, |
| "learning_rate": 0.00019941590358226713, |
| "loss": 3.2076, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.06353441977191143, |
| "eval_loss": 3.1886417865753174, |
| "eval_runtime": 45.0925, |
| "eval_samples_per_second": 59.943, |
| "eval_steps_per_second": 29.983, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.06416976396963055, |
| "grad_norm": 1.4352892637252808, |
| "learning_rate": 0.00019939348466009588, |
| "loss": 3.1246, |
| "step": 2020 |
| }, |
| { |
| "epoch": 0.06480510816734966, |
| "grad_norm": 1.4391227960586548, |
| "learning_rate": 0.0001993706448658625, |
| "loss": 3.1187, |
| "step": 2040 |
| }, |
| { |
| "epoch": 0.06544045236506878, |
| "grad_norm": 1.2951711416244507, |
| "learning_rate": 0.0001993473842962798, |
| "loss": 3.0175, |
| "step": 2060 |
| }, |
| { |
| "epoch": 0.06607579656278789, |
| "grad_norm": 1.559552550315857, |
| "learning_rate": 0.00019932370304984255, |
| "loss": 2.8894, |
| "step": 2080 |
| }, |
| { |
| "epoch": 0.066711140760507, |
| "grad_norm": 1.2822929620742798, |
| "learning_rate": 0.00019929960122682655, |
| "loss": 2.8483, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.06734648495822612, |
| "grad_norm": 1.4227052927017212, |
| "learning_rate": 0.00019927507892928873, |
| "loss": 2.8691, |
| "step": 2120 |
| }, |
| { |
| "epoch": 0.06798182915594524, |
| "grad_norm": 1.643660306930542, |
| "learning_rate": 0.00019925013626106633, |
| "loss": 2.8578, |
| "step": 2140 |
| }, |
| { |
| "epoch": 0.06861717335366435, |
| "grad_norm": 1.1360414028167725, |
| "learning_rate": 0.00019922477332777664, |
| "loss": 2.7094, |
| "step": 2160 |
| }, |
| { |
| "epoch": 0.06925251755138347, |
| "grad_norm": 1.224853277206421, |
| "learning_rate": 0.00019919899023681658, |
| "loss": 2.6953, |
| "step": 2180 |
| }, |
| { |
| "epoch": 0.06988786174910258, |
| "grad_norm": 1.093682885169983, |
| "learning_rate": 0.00019917278709736212, |
| "loss": 2.6255, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.0705232059468217, |
| "grad_norm": 1.238864779472351, |
| "learning_rate": 0.00019914616402036796, |
| "loss": 2.5893, |
| "step": 2220 |
| }, |
| { |
| "epoch": 0.07115855014454081, |
| "grad_norm": 1.1016559600830078, |
| "learning_rate": 0.00019911912111856688, |
| "loss": 2.4743, |
| "step": 2240 |
| }, |
| { |
| "epoch": 0.07179389434225993, |
| "grad_norm": 1.12881600856781, |
| "learning_rate": 0.00019909165850646941, |
| "loss": 2.5057, |
| "step": 2260 |
| }, |
| { |
| "epoch": 0.07242923853997903, |
| "grad_norm": 1.216238021850586, |
| "learning_rate": 0.00019906377630036338, |
| "loss": 2.4624, |
| "step": 2280 |
| }, |
| { |
| "epoch": 0.07306458273769814, |
| "grad_norm": 1.1429589986801147, |
| "learning_rate": 0.00019903547461831323, |
| "loss": 2.3835, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.07369992693541726, |
| "grad_norm": 0.9367678165435791, |
| "learning_rate": 0.00019900675358015967, |
| "loss": 2.3971, |
| "step": 2320 |
| }, |
| { |
| "epoch": 0.07433527113313637, |
| "grad_norm": 1.0869677066802979, |
| "learning_rate": 0.00019897761330751922, |
| "loss": 2.3241, |
| "step": 2340 |
| }, |
| { |
| "epoch": 0.07497061533085549, |
| "grad_norm": 0.958840548992157, |
| "learning_rate": 0.0001989480539237835, |
| "loss": 2.2828, |
| "step": 2360 |
| }, |
| { |
| "epoch": 0.0756059595285746, |
| "grad_norm": 0.9724891781806946, |
| "learning_rate": 0.00019891807555411884, |
| "loss": 2.2858, |
| "step": 2380 |
| }, |
| { |
| "epoch": 0.07624130372629372, |
| "grad_norm": 1.045828104019165, |
| "learning_rate": 0.00019888767832546572, |
| "loss": 2.2949, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.07687664792401283, |
| "grad_norm": 1.0283712148666382, |
| "learning_rate": 0.0001988568623665383, |
| "loss": 2.2034, |
| "step": 2420 |
| }, |
| { |
| "epoch": 0.07751199212173195, |
| "grad_norm": 1.0930371284484863, |
| "learning_rate": 0.00019882562780782376, |
| "loss": 2.2283, |
| "step": 2440 |
| }, |
| { |
| "epoch": 0.07814733631945106, |
| "grad_norm": 0.892132580280304, |
| "learning_rate": 0.00019879397478158177, |
| "loss": 2.1872, |
| "step": 2460 |
| }, |
| { |
| "epoch": 0.07878268051717018, |
| "grad_norm": 1.0107035636901855, |
| "learning_rate": 0.00019876190342184402, |
| "loss": 2.1874, |
| "step": 2480 |
| }, |
| { |
| "epoch": 0.07941802471488929, |
| "grad_norm": 1.1195555925369263, |
| "learning_rate": 0.00019872941386441358, |
| "loss": 2.0823, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.0800533689126084, |
| "grad_norm": 1.2803888320922852, |
| "learning_rate": 0.0001986965062468643, |
| "loss": 2.0905, |
| "step": 2520 |
| }, |
| { |
| "epoch": 0.08068871311032752, |
| "grad_norm": 1.0955703258514404, |
| "learning_rate": 0.00019866318070854033, |
| "loss": 2.0645, |
| "step": 2540 |
| }, |
| { |
| "epoch": 0.08132405730804664, |
| "grad_norm": 1.117477297782898, |
| "learning_rate": 0.00019862943739055536, |
| "loss": 2.0259, |
| "step": 2560 |
| }, |
| { |
| "epoch": 0.08195940150576575, |
| "grad_norm": 0.9660820960998535, |
| "learning_rate": 0.0001985952764357923, |
| "loss": 1.9881, |
| "step": 2580 |
| }, |
| { |
| "epoch": 0.08259474570348486, |
| "grad_norm": 0.9186820983886719, |
| "learning_rate": 0.0001985606979889023, |
| "loss": 1.9571, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.08323008990120398, |
| "grad_norm": 1.1236801147460938, |
| "learning_rate": 0.00019852570219630445, |
| "loss": 1.9506, |
| "step": 2620 |
| }, |
| { |
| "epoch": 0.0838654340989231, |
| "grad_norm": 0.9719575047492981, |
| "learning_rate": 0.0001984902892061851, |
| "loss": 1.9359, |
| "step": 2640 |
| }, |
| { |
| "epoch": 0.08450077829664221, |
| "grad_norm": 1.3401118516921997, |
| "learning_rate": 0.00019845445916849704, |
| "loss": 1.9707, |
| "step": 2660 |
| }, |
| { |
| "epoch": 0.08513612249436132, |
| "grad_norm": 0.980446457862854, |
| "learning_rate": 0.00019841821223495916, |
| "loss": 1.88, |
| "step": 2680 |
| }, |
| { |
| "epoch": 0.08577146669208044, |
| "grad_norm": 1.178143858909607, |
| "learning_rate": 0.00019838154855905552, |
| "loss": 1.8629, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.08640681088979955, |
| "grad_norm": 0.9232170581817627, |
| "learning_rate": 0.00019834446829603494, |
| "loss": 1.8467, |
| "step": 2720 |
| }, |
| { |
| "epoch": 0.08704215508751867, |
| "grad_norm": 1.7343891859054565, |
| "learning_rate": 0.00019830697160291017, |
| "loss": 1.8194, |
| "step": 2740 |
| }, |
| { |
| "epoch": 0.08767749928523778, |
| "grad_norm": 0.878983199596405, |
| "learning_rate": 0.0001982690586384573, |
| "loss": 1.8232, |
| "step": 2760 |
| }, |
| { |
| "epoch": 0.0883128434829569, |
| "grad_norm": 1.0917317867279053, |
| "learning_rate": 0.00019823072956321513, |
| "loss": 1.7668, |
| "step": 2780 |
| }, |
| { |
| "epoch": 0.088948187680676, |
| "grad_norm": 1.0753387212753296, |
| "learning_rate": 0.00019819198453948443, |
| "loss": 1.7968, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.08958353187839511, |
| "grad_norm": 1.0904388427734375, |
| "learning_rate": 0.00019815282373132718, |
| "loss": 1.7834, |
| "step": 2820 |
| }, |
| { |
| "epoch": 0.09021887607611423, |
| "grad_norm": 0.9622576236724854, |
| "learning_rate": 0.00019811324730456607, |
| "loss": 1.7773, |
| "step": 2840 |
| }, |
| { |
| "epoch": 0.09085422027383334, |
| "grad_norm": 0.8677240610122681, |
| "learning_rate": 0.0001980732554267836, |
| "loss": 1.7322, |
| "step": 2860 |
| }, |
| { |
| "epoch": 0.09148956447155246, |
| "grad_norm": 1.0953987836837769, |
| "learning_rate": 0.0001980328482673215, |
| "loss": 1.7123, |
| "step": 2880 |
| }, |
| { |
| "epoch": 0.09212490866927157, |
| "grad_norm": 1.0277127027511597, |
| "learning_rate": 0.00019799202599727998, |
| "loss": 1.7558, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.09276025286699069, |
| "grad_norm": 1.1174383163452148, |
| "learning_rate": 0.000197950788789517, |
| "loss": 1.7222, |
| "step": 2920 |
| }, |
| { |
| "epoch": 0.0933955970647098, |
| "grad_norm": 0.9651451706886292, |
| "learning_rate": 0.00019790913681864747, |
| "loss": 1.6652, |
| "step": 2940 |
| }, |
| { |
| "epoch": 0.09403094126242892, |
| "grad_norm": 0.9669461250305176, |
| "learning_rate": 0.00019786707026104265, |
| "loss": 1.6381, |
| "step": 2960 |
| }, |
| { |
| "epoch": 0.09466628546014803, |
| "grad_norm": 0.9406834244728088, |
| "learning_rate": 0.0001978245892948293, |
| "loss": 1.6276, |
| "step": 2980 |
| }, |
| { |
| "epoch": 0.09530162965786715, |
| "grad_norm": 0.9768303632736206, |
| "learning_rate": 0.0001977816940998889, |
| "loss": 1.6071, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.09530162965786715, |
| "eval_loss": 1.5878759622573853, |
| "eval_runtime": 45.9166, |
| "eval_samples_per_second": 58.868, |
| "eval_steps_per_second": 29.445, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.09593697385558626, |
| "grad_norm": 1.083208441734314, |
| "learning_rate": 0.00019773838485785702, |
| "loss": 1.6341, |
| "step": 3020 |
| }, |
| { |
| "epoch": 0.09657231805330538, |
| "grad_norm": 0.9333330988883972, |
| "learning_rate": 0.00019769466175212244, |
| "loss": 1.5931, |
| "step": 3040 |
| }, |
| { |
| "epoch": 0.0972076622510245, |
| "grad_norm": 0.9718533754348755, |
| "learning_rate": 0.00019765052496782638, |
| "loss": 1.5735, |
| "step": 3060 |
| }, |
| { |
| "epoch": 0.09784300644874361, |
| "grad_norm": 1.2169800996780396, |
| "learning_rate": 0.00019760597469186184, |
| "loss": 1.5507, |
| "step": 3080 |
| }, |
| { |
| "epoch": 0.09847835064646272, |
| "grad_norm": 0.9822967648506165, |
| "learning_rate": 0.00019756101111287257, |
| "loss": 1.5784, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.09911369484418184, |
| "grad_norm": 0.9830970168113708, |
| "learning_rate": 0.0001975156344212525, |
| "loss": 1.5473, |
| "step": 3120 |
| }, |
| { |
| "epoch": 0.09974903904190095, |
| "grad_norm": 0.8926035761833191, |
| "learning_rate": 0.00019746984480914484, |
| "loss": 1.5141, |
| "step": 3140 |
| }, |
| { |
| "epoch": 0.10038438323962007, |
| "grad_norm": 0.8814927339553833, |
| "learning_rate": 0.00019742364247044125, |
| "loss": 1.5164, |
| "step": 3160 |
| }, |
| { |
| "epoch": 0.10101972743733918, |
| "grad_norm": 0.8626115322113037, |
| "learning_rate": 0.00019737702760078105, |
| "loss": 1.4495, |
| "step": 3180 |
| }, |
| { |
| "epoch": 0.1016550716350583, |
| "grad_norm": 1.0857669115066528, |
| "learning_rate": 0.00019733000039755036, |
| "loss": 1.511, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.10229041583277741, |
| "grad_norm": 0.8834457397460938, |
| "learning_rate": 0.00019728256105988132, |
| "loss": 1.4764, |
| "step": 3220 |
| }, |
| { |
| "epoch": 0.10292576003049653, |
| "grad_norm": 0.8241048455238342, |
| "learning_rate": 0.00019723470978865118, |
| "loss": 1.4253, |
| "step": 3240 |
| }, |
| { |
| "epoch": 0.10356110422821564, |
| "grad_norm": 0.9844352006912231, |
| "learning_rate": 0.00019718644678648158, |
| "loss": 1.4595, |
| "step": 3260 |
| }, |
| { |
| "epoch": 0.10419644842593476, |
| "grad_norm": 0.8982945084571838, |
| "learning_rate": 0.00019713777225773745, |
| "loss": 1.3535, |
| "step": 3280 |
| }, |
| { |
| "epoch": 0.10483179262365386, |
| "grad_norm": 1.2204469442367554, |
| "learning_rate": 0.0001970886864085263, |
| "loss": 1.4283, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.10546713682137297, |
| "grad_norm": 1.0676652193069458, |
| "learning_rate": 0.00019703918944669754, |
| "loss": 1.3858, |
| "step": 3320 |
| }, |
| { |
| "epoch": 0.10610248101909209, |
| "grad_norm": 1.17191743850708, |
| "learning_rate": 0.00019698928158184116, |
| "loss": 1.4426, |
| "step": 3340 |
| }, |
| { |
| "epoch": 0.1067378252168112, |
| "grad_norm": 0.9601316452026367, |
| "learning_rate": 0.00019693896302528716, |
| "loss": 1.3621, |
| "step": 3360 |
| }, |
| { |
| "epoch": 0.10737316941453032, |
| "grad_norm": 0.9755037426948547, |
| "learning_rate": 0.00019688823399010463, |
| "loss": 1.3901, |
| "step": 3380 |
| }, |
| { |
| "epoch": 0.10800851361224943, |
| "grad_norm": 1.0111849308013916, |
| "learning_rate": 0.0001968370946911007, |
| "loss": 1.3748, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.10864385780996855, |
| "grad_norm": 0.8471179604530334, |
| "learning_rate": 0.00019678554534481978, |
| "loss": 1.3227, |
| "step": 3420 |
| }, |
| { |
| "epoch": 0.10927920200768766, |
| "grad_norm": 0.9206441640853882, |
| "learning_rate": 0.0001967335861695426, |
| "loss": 1.3493, |
| "step": 3440 |
| }, |
| { |
| "epoch": 0.10991454620540678, |
| "grad_norm": 1.055109977722168, |
| "learning_rate": 0.0001966812173852852, |
| "loss": 1.3549, |
| "step": 3460 |
| }, |
| { |
| "epoch": 0.11054989040312589, |
| "grad_norm": 0.995614767074585, |
| "learning_rate": 0.00019662843921379816, |
| "loss": 1.3468, |
| "step": 3480 |
| }, |
| { |
| "epoch": 0.111185234600845, |
| "grad_norm": 0.8873100876808167, |
| "learning_rate": 0.0001965752518785655, |
| "loss": 1.3129, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.11182057879856412, |
| "grad_norm": 0.9802286624908447, |
| "learning_rate": 0.00019652165560480383, |
| "loss": 1.34, |
| "step": 3520 |
| }, |
| { |
| "epoch": 0.11245592299628324, |
| "grad_norm": 0.9177120923995972, |
| "learning_rate": 0.00019646765061946133, |
| "loss": 1.3321, |
| "step": 3540 |
| }, |
| { |
| "epoch": 0.11309126719400235, |
| "grad_norm": 1.0982646942138672, |
| "learning_rate": 0.00019641323715121692, |
| "loss": 1.292, |
| "step": 3560 |
| }, |
| { |
| "epoch": 0.11372661139172147, |
| "grad_norm": 1.1567240953445435, |
| "learning_rate": 0.00019635841543047918, |
| "loss": 1.3052, |
| "step": 3580 |
| }, |
| { |
| "epoch": 0.11436195558944058, |
| "grad_norm": 0.8516421914100647, |
| "learning_rate": 0.00019630318568938528, |
| "loss": 1.3189, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.1149972997871597, |
| "grad_norm": 0.9710924029350281, |
| "learning_rate": 0.00019624754816180022, |
| "loss": 1.2644, |
| "step": 3620 |
| }, |
| { |
| "epoch": 0.11563264398487881, |
| "grad_norm": 0.9252649545669556, |
| "learning_rate": 0.00019619150308331572, |
| "loss": 1.2517, |
| "step": 3640 |
| }, |
| { |
| "epoch": 0.11626798818259793, |
| "grad_norm": 0.973948061466217, |
| "learning_rate": 0.0001961350506912493, |
| "loss": 1.2651, |
| "step": 3660 |
| }, |
| { |
| "epoch": 0.11690333238031704, |
| "grad_norm": 0.9078177213668823, |
| "learning_rate": 0.000196078191224643, |
| "loss": 1.2089, |
| "step": 3680 |
| }, |
| { |
| "epoch": 0.11753867657803616, |
| "grad_norm": 0.8456325531005859, |
| "learning_rate": 0.0001960209249242628, |
| "loss": 1.2503, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.11817402077575527, |
| "grad_norm": 1.2014869451522827, |
| "learning_rate": 0.00019596325203259722, |
| "loss": 1.2287, |
| "step": 3720 |
| }, |
| { |
| "epoch": 0.11880936497347439, |
| "grad_norm": 0.903296172618866, |
| "learning_rate": 0.0001959051727938566, |
| "loss": 1.1999, |
| "step": 3740 |
| }, |
| { |
| "epoch": 0.1194447091711935, |
| "grad_norm": 0.9159349799156189, |
| "learning_rate": 0.00019584668745397182, |
| "loss": 1.2077, |
| "step": 3760 |
| }, |
| { |
| "epoch": 0.12008005336891261, |
| "grad_norm": 1.0457518100738525, |
| "learning_rate": 0.00019578779626059332, |
| "loss": 1.2395, |
| "step": 3780 |
| }, |
| { |
| "epoch": 0.12071539756663172, |
| "grad_norm": 0.8328551650047302, |
| "learning_rate": 0.0001957284994630902, |
| "loss": 1.2039, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.12135074176435083, |
| "grad_norm": 0.9112881422042847, |
| "learning_rate": 0.00019566879731254902, |
| "loss": 1.1987, |
| "step": 3820 |
| }, |
| { |
| "epoch": 0.12198608596206995, |
| "grad_norm": 2.0256752967834473, |
| "learning_rate": 0.00019560869006177262, |
| "loss": 1.1923, |
| "step": 3840 |
| }, |
| { |
| "epoch": 0.12262143015978906, |
| "grad_norm": 0.9714537262916565, |
| "learning_rate": 0.00019554817796527943, |
| "loss": 1.1726, |
| "step": 3860 |
| }, |
| { |
| "epoch": 0.12325677435750818, |
| "grad_norm": 0.8522310256958008, |
| "learning_rate": 0.00019548726127930198, |
| "loss": 1.1985, |
| "step": 3880 |
| }, |
| { |
| "epoch": 0.12389211855522729, |
| "grad_norm": 0.8728988766670227, |
| "learning_rate": 0.00019542594026178612, |
| "loss": 1.1662, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.1245274627529464, |
| "grad_norm": 0.9155168533325195, |
| "learning_rate": 0.00019536421517238973, |
| "loss": 1.1529, |
| "step": 3920 |
| }, |
| { |
| "epoch": 0.12516280695066553, |
| "grad_norm": 1.05704665184021, |
| "learning_rate": 0.0001953020862724817, |
| "loss": 1.1415, |
| "step": 3940 |
| }, |
| { |
| "epoch": 0.12579815114838463, |
| "grad_norm": 0.7793872952461243, |
| "learning_rate": 0.0001952395538251408, |
| "loss": 1.1387, |
| "step": 3960 |
| }, |
| { |
| "epoch": 0.12643349534610376, |
| "grad_norm": 0.9358331561088562, |
| "learning_rate": 0.00019517661809515465, |
| "loss": 1.1816, |
| "step": 3980 |
| }, |
| { |
| "epoch": 0.12706883954382286, |
| "grad_norm": 0.8175097107887268, |
| "learning_rate": 0.00019511327934901846, |
| "loss": 1.126, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.12706883954382286, |
| "eval_loss": 1.1081569194793701, |
| "eval_runtime": 128.6977, |
| "eval_samples_per_second": 21.003, |
| "eval_steps_per_second": 10.505, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.127704183741542, |
| "grad_norm": 0.9568232893943787, |
| "learning_rate": 0.000195049537854934, |
| "loss": 1.1002, |
| "step": 4020 |
| }, |
| { |
| "epoch": 0.1283395279392611, |
| "grad_norm": 0.9011651277542114, |
| "learning_rate": 0.00019498539388280848, |
| "loss": 1.129, |
| "step": 4040 |
| }, |
| { |
| "epoch": 0.1289748721369802, |
| "grad_norm": 1.045811653137207, |
| "learning_rate": 0.00019492084770425327, |
| "loss": 1.0945, |
| "step": 4060 |
| }, |
| { |
| "epoch": 0.12961021633469932, |
| "grad_norm": 0.8668608069419861, |
| "learning_rate": 0.00019485589959258292, |
| "loss": 1.0601, |
| "step": 4080 |
| }, |
| { |
| "epoch": 0.13024556053241843, |
| "grad_norm": 0.9976728558540344, |
| "learning_rate": 0.00019479054982281393, |
| "loss": 1.1127, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.13088090473013755, |
| "grad_norm": 0.9135074019432068, |
| "learning_rate": 0.00019472479867166354, |
| "loss": 1.0708, |
| "step": 4120 |
| }, |
| { |
| "epoch": 0.13151624892785665, |
| "grad_norm": 0.8302998542785645, |
| "learning_rate": 0.0001946586464175486, |
| "loss": 1.0925, |
| "step": 4140 |
| }, |
| { |
| "epoch": 0.13215159312557578, |
| "grad_norm": 0.9594709277153015, |
| "learning_rate": 0.0001945920933405844, |
| "loss": 1.0879, |
| "step": 4160 |
| }, |
| { |
| "epoch": 0.13278693732329488, |
| "grad_norm": 1.3145122528076172, |
| "learning_rate": 0.00019452513972258352, |
| "loss": 1.0706, |
| "step": 4180 |
| }, |
| { |
| "epoch": 0.133422281521014, |
| "grad_norm": 1.0521440505981445, |
| "learning_rate": 0.00019445778584705452, |
| "loss": 1.1089, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.13405762571873311, |
| "grad_norm": 1.1046104431152344, |
| "learning_rate": 0.00019439003199920088, |
| "loss": 1.0965, |
| "step": 4220 |
| }, |
| { |
| "epoch": 0.13469296991645224, |
| "grad_norm": 1.1228617429733276, |
| "learning_rate": 0.00019432187846591967, |
| "loss": 1.0747, |
| "step": 4240 |
| }, |
| { |
| "epoch": 0.13532831411417134, |
| "grad_norm": 0.8399156332015991, |
| "learning_rate": 0.00019425332553580044, |
| "loss": 1.0239, |
| "step": 4260 |
| }, |
| { |
| "epoch": 0.13596365831189047, |
| "grad_norm": 0.9118017554283142, |
| "learning_rate": 0.00019418437349912385, |
| "loss": 1.0557, |
| "step": 4280 |
| }, |
| { |
| "epoch": 0.13659900250960957, |
| "grad_norm": 1.1154282093048096, |
| "learning_rate": 0.00019411502264786069, |
| "loss": 1.0846, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.1372343467073287, |
| "grad_norm": 0.8457648158073425, |
| "learning_rate": 0.00019404527327567035, |
| "loss": 1.0438, |
| "step": 4320 |
| }, |
| { |
| "epoch": 0.1378696909050478, |
| "grad_norm": 0.9336498975753784, |
| "learning_rate": 0.0001939751256778998, |
| "loss": 1.0403, |
| "step": 4340 |
| }, |
| { |
| "epoch": 0.13850503510276693, |
| "grad_norm": 0.9318077564239502, |
| "learning_rate": 0.0001939045801515822, |
| "loss": 1.0375, |
| "step": 4360 |
| }, |
| { |
| "epoch": 0.13914037930048603, |
| "grad_norm": 0.9146689176559448, |
| "learning_rate": 0.0001938336369954358, |
| "loss": 1.0394, |
| "step": 4380 |
| }, |
| { |
| "epoch": 0.13977572349820516, |
| "grad_norm": 1.2244622707366943, |
| "learning_rate": 0.00019376229650986245, |
| "loss": 1.0305, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.14041106769592426, |
| "grad_norm": 0.9721834659576416, |
| "learning_rate": 0.00019369055899694652, |
| "loss": 1.0133, |
| "step": 4420 |
| }, |
| { |
| "epoch": 0.1410464118936434, |
| "grad_norm": 0.8538774251937866, |
| "learning_rate": 0.00019361842476045356, |
| "loss": 1.0272, |
| "step": 4440 |
| }, |
| { |
| "epoch": 0.1416817560913625, |
| "grad_norm": 0.7733943462371826, |
| "learning_rate": 0.000193545894105829, |
| "loss": 1.0328, |
| "step": 4460 |
| }, |
| { |
| "epoch": 0.14231710028908162, |
| "grad_norm": 1.0937755107879639, |
| "learning_rate": 0.00019347296734019683, |
| "loss": 1.0501, |
| "step": 4480 |
| }, |
| { |
| "epoch": 0.14295244448680072, |
| "grad_norm": 0.8855345845222473, |
| "learning_rate": 0.00019339964477235836, |
| "loss": 0.9979, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.14358778868451985, |
| "grad_norm": 0.9113184213638306, |
| "learning_rate": 0.0001933259267127909, |
| "loss": 0.967, |
| "step": 4520 |
| }, |
| { |
| "epoch": 0.14422313288223895, |
| "grad_norm": 0.9671328663825989, |
| "learning_rate": 0.00019325181347364643, |
| "loss": 1.016, |
| "step": 4540 |
| }, |
| { |
| "epoch": 0.14485847707995805, |
| "grad_norm": 0.8655368685722351, |
| "learning_rate": 0.00019317730536875022, |
| "loss": 1.0005, |
| "step": 4560 |
| }, |
| { |
| "epoch": 0.14549382127767718, |
| "grad_norm": 0.8673165440559387, |
| "learning_rate": 0.00019310240271359967, |
| "loss": 0.9697, |
| "step": 4580 |
| }, |
| { |
| "epoch": 0.14612916547539628, |
| "grad_norm": 1.0993086099624634, |
| "learning_rate": 0.00019302710582536276, |
| "loss": 0.9832, |
| "step": 4600 |
| }, |
| { |
| "epoch": 0.1467645096731154, |
| "grad_norm": 1.1561827659606934, |
| "learning_rate": 0.00019295141502287687, |
| "loss": 0.9603, |
| "step": 4620 |
| }, |
| { |
| "epoch": 0.1473998538708345, |
| "grad_norm": 1.0052567720413208, |
| "learning_rate": 0.00019287533062664733, |
| "loss": 0.9808, |
| "step": 4640 |
| }, |
| { |
| "epoch": 0.14803519806855364, |
| "grad_norm": 0.9202858209609985, |
| "learning_rate": 0.00019279885295884618, |
| "loss": 0.9564, |
| "step": 4660 |
| }, |
| { |
| "epoch": 0.14867054226627274, |
| "grad_norm": 0.8606549501419067, |
| "learning_rate": 0.0001927219823433106, |
| "loss": 0.9936, |
| "step": 4680 |
| }, |
| { |
| "epoch": 0.14930588646399187, |
| "grad_norm": 0.9188569784164429, |
| "learning_rate": 0.00019264471910554183, |
| "loss": 0.9833, |
| "step": 4700 |
| }, |
| { |
| "epoch": 0.14994123066171097, |
| "grad_norm": 0.7773941159248352, |
| "learning_rate": 0.0001925670635727035, |
| "loss": 0.9272, |
| "step": 4720 |
| }, |
| { |
| "epoch": 0.1505765748594301, |
| "grad_norm": 0.8689327836036682, |
| "learning_rate": 0.00019248901607362047, |
| "loss": 0.9462, |
| "step": 4740 |
| }, |
| { |
| "epoch": 0.1512119190571492, |
| "grad_norm": 0.800255298614502, |
| "learning_rate": 0.00019241057693877725, |
| "loss": 0.9222, |
| "step": 4760 |
| }, |
| { |
| "epoch": 0.15184726325486833, |
| "grad_norm": 0.9326597452163696, |
| "learning_rate": 0.0001923317465003168, |
| "loss": 0.961, |
| "step": 4780 |
| }, |
| { |
| "epoch": 0.15248260745258743, |
| "grad_norm": 1.072416067123413, |
| "learning_rate": 0.00019225252509203888, |
| "loss": 0.9464, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.15311795165030656, |
| "grad_norm": 0.9187152981758118, |
| "learning_rate": 0.0001921729130493989, |
| "loss": 0.9461, |
| "step": 4820 |
| }, |
| { |
| "epoch": 0.15375329584802566, |
| "grad_norm": 0.8737976551055908, |
| "learning_rate": 0.00019209291070950633, |
| "loss": 0.8771, |
| "step": 4840 |
| }, |
| { |
| "epoch": 0.1543886400457448, |
| "grad_norm": 0.9321054220199585, |
| "learning_rate": 0.0001920125184111233, |
| "loss": 0.9179, |
| "step": 4860 |
| }, |
| { |
| "epoch": 0.1550239842434639, |
| "grad_norm": 0.7673978209495544, |
| "learning_rate": 0.00019193173649466322, |
| "loss": 0.8711, |
| "step": 4880 |
| }, |
| { |
| "epoch": 0.15565932844118302, |
| "grad_norm": 1.0326552391052246, |
| "learning_rate": 0.00019185056530218923, |
| "loss": 0.9494, |
| "step": 4900 |
| }, |
| { |
| "epoch": 0.15629467263890212, |
| "grad_norm": 0.8184536695480347, |
| "learning_rate": 0.0001917690051774129, |
| "loss": 0.9201, |
| "step": 4920 |
| }, |
| { |
| "epoch": 0.15693001683662125, |
| "grad_norm": 0.8319898247718811, |
| "learning_rate": 0.0001916870564656926, |
| "loss": 0.9167, |
| "step": 4940 |
| }, |
| { |
| "epoch": 0.15756536103434035, |
| "grad_norm": 1.0563160181045532, |
| "learning_rate": 0.0001916047195140323, |
| "loss": 0.8993, |
| "step": 4960 |
| }, |
| { |
| "epoch": 0.15820070523205948, |
| "grad_norm": 0.8466194868087769, |
| "learning_rate": 0.00019152199467107974, |
| "loss": 0.9198, |
| "step": 4980 |
| }, |
| { |
| "epoch": 0.15883604942977858, |
| "grad_norm": 1.1115593910217285, |
| "learning_rate": 0.00019143888228712527, |
| "loss": 0.8749, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.15883604942977858, |
| "eval_loss": 0.8843944668769836, |
| "eval_runtime": 127.8707, |
| "eval_samples_per_second": 21.139, |
| "eval_steps_per_second": 10.573, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.1594713936274977, |
| "grad_norm": 0.9679493308067322, |
| "learning_rate": 0.00019135538271410022, |
| "loss": 0.9212, |
| "step": 5020 |
| }, |
| { |
| "epoch": 0.1601067378252168, |
| "grad_norm": 0.8485816121101379, |
| "learning_rate": 0.0001912714963055754, |
| "loss": 0.9054, |
| "step": 5040 |
| }, |
| { |
| "epoch": 0.16074208202293594, |
| "grad_norm": 1.0210843086242676, |
| "learning_rate": 0.0001911872234167597, |
| "loss": 0.917, |
| "step": 5060 |
| }, |
| { |
| "epoch": 0.16137742622065504, |
| "grad_norm": 1.0072481632232666, |
| "learning_rate": 0.00019110256440449844, |
| "loss": 0.9014, |
| "step": 5080 |
| }, |
| { |
| "epoch": 0.16201277041837414, |
| "grad_norm": 0.9833612442016602, |
| "learning_rate": 0.00019101751962727204, |
| "loss": 0.891, |
| "step": 5100 |
| }, |
| { |
| "epoch": 0.16264811461609327, |
| "grad_norm": 1.0564861297607422, |
| "learning_rate": 0.0001909320894451943, |
| "loss": 0.8581, |
| "step": 5120 |
| }, |
| { |
| "epoch": 0.16328345881381237, |
| "grad_norm": 1.1205075979232788, |
| "learning_rate": 0.0001908462742200111, |
| "loss": 0.8884, |
| "step": 5140 |
| }, |
| { |
| "epoch": 0.1639188030115315, |
| "grad_norm": 0.9841699004173279, |
| "learning_rate": 0.0001907600743150986, |
| "loss": 0.8815, |
| "step": 5160 |
| }, |
| { |
| "epoch": 0.1645541472092506, |
| "grad_norm": 0.852820098400116, |
| "learning_rate": 0.00019067349009546197, |
| "loss": 0.8594, |
| "step": 5180 |
| }, |
| { |
| "epoch": 0.16518949140696973, |
| "grad_norm": 0.8630360960960388, |
| "learning_rate": 0.00019058652192773372, |
| "loss": 0.8653, |
| "step": 5200 |
| }, |
| { |
| "epoch": 0.16582483560468883, |
| "grad_norm": 1.0112591981887817, |
| "learning_rate": 0.00019049917018017207, |
| "loss": 0.8715, |
| "step": 5220 |
| }, |
| { |
| "epoch": 0.16646017980240796, |
| "grad_norm": 0.9182717204093933, |
| "learning_rate": 0.00019041143522265948, |
| "loss": 0.8875, |
| "step": 5240 |
| }, |
| { |
| "epoch": 0.16709552400012706, |
| "grad_norm": 1.190596103668213, |
| "learning_rate": 0.0001903233174267012, |
| "loss": 0.9027, |
| "step": 5260 |
| }, |
| { |
| "epoch": 0.1677308681978462, |
| "grad_norm": 0.8345910310745239, |
| "learning_rate": 0.00019023481716542342, |
| "loss": 0.8819, |
| "step": 5280 |
| }, |
| { |
| "epoch": 0.1683662123955653, |
| "grad_norm": 0.8964826464653015, |
| "learning_rate": 0.00019014593481357192, |
| "loss": 0.845, |
| "step": 5300 |
| }, |
| { |
| "epoch": 0.16900155659328442, |
| "grad_norm": 1.1423965692520142, |
| "learning_rate": 0.0001900566707475104, |
| "loss": 0.8463, |
| "step": 5320 |
| }, |
| { |
| "epoch": 0.16963690079100352, |
| "grad_norm": 0.895899772644043, |
| "learning_rate": 0.00018996702534521888, |
| "loss": 0.8631, |
| "step": 5340 |
| }, |
| { |
| "epoch": 0.17027224498872265, |
| "grad_norm": 1.0254230499267578, |
| "learning_rate": 0.00018987699898629208, |
| "loss": 0.8489, |
| "step": 5360 |
| }, |
| { |
| "epoch": 0.17090758918644175, |
| "grad_norm": 0.9370276927947998, |
| "learning_rate": 0.00018978659205193794, |
| "loss": 0.8822, |
| "step": 5380 |
| }, |
| { |
| "epoch": 0.17154293338416088, |
| "grad_norm": 1.1030024290084839, |
| "learning_rate": 0.00018969580492497577, |
| "loss": 0.8834, |
| "step": 5400 |
| }, |
| { |
| "epoch": 0.17217827758187998, |
| "grad_norm": 0.9148856997489929, |
| "learning_rate": 0.00018960463798983494, |
| "loss": 0.8198, |
| "step": 5420 |
| }, |
| { |
| "epoch": 0.1728136217795991, |
| "grad_norm": 0.8851357102394104, |
| "learning_rate": 0.00018951309163255288, |
| "loss": 0.8077, |
| "step": 5440 |
| }, |
| { |
| "epoch": 0.1734489659773182, |
| "grad_norm": 0.9701651334762573, |
| "learning_rate": 0.00018942116624077386, |
| "loss": 0.8687, |
| "step": 5460 |
| }, |
| { |
| "epoch": 0.17408431017503734, |
| "grad_norm": 0.9508700966835022, |
| "learning_rate": 0.00018932886220374696, |
| "loss": 0.8764, |
| "step": 5480 |
| }, |
| { |
| "epoch": 0.17471965437275644, |
| "grad_norm": 0.9914870858192444, |
| "learning_rate": 0.00018923617991232466, |
| "loss": 0.8157, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.17535499857047557, |
| "grad_norm": 1.010511040687561, |
| "learning_rate": 0.00018914311975896117, |
| "loss": 0.839, |
| "step": 5520 |
| }, |
| { |
| "epoch": 0.17599034276819467, |
| "grad_norm": 0.8063015937805176, |
| "learning_rate": 0.00018904968213771065, |
| "loss": 0.8308, |
| "step": 5540 |
| }, |
| { |
| "epoch": 0.1766256869659138, |
| "grad_norm": 0.8653827905654907, |
| "learning_rate": 0.00018895586744422564, |
| "loss": 0.8304, |
| "step": 5560 |
| }, |
| { |
| "epoch": 0.1772610311636329, |
| "grad_norm": 1.0596357583999634, |
| "learning_rate": 0.00018886167607575532, |
| "loss": 0.8346, |
| "step": 5580 |
| }, |
| { |
| "epoch": 0.177896375361352, |
| "grad_norm": 1.0251786708831787, |
| "learning_rate": 0.00018876710843114398, |
| "loss": 0.8639, |
| "step": 5600 |
| }, |
| { |
| "epoch": 0.17853171955907113, |
| "grad_norm": 0.8897235989570618, |
| "learning_rate": 0.00018867216491082905, |
| "loss": 0.8286, |
| "step": 5620 |
| }, |
| { |
| "epoch": 0.17916706375679023, |
| "grad_norm": 0.8118072748184204, |
| "learning_rate": 0.00018857684591683967, |
| "loss": 0.8597, |
| "step": 5640 |
| }, |
| { |
| "epoch": 0.17980240795450936, |
| "grad_norm": 0.8698698878288269, |
| "learning_rate": 0.0001884811518527949, |
| "loss": 0.7894, |
| "step": 5660 |
| }, |
| { |
| "epoch": 0.18043775215222846, |
| "grad_norm": 0.8228470087051392, |
| "learning_rate": 0.00018838508312390192, |
| "loss": 0.8302, |
| "step": 5680 |
| }, |
| { |
| "epoch": 0.1810730963499476, |
| "grad_norm": 1.1411319971084595, |
| "learning_rate": 0.00018828864013695448, |
| "loss": 0.8313, |
| "step": 5700 |
| }, |
| { |
| "epoch": 0.1817084405476667, |
| "grad_norm": 0.8076447248458862, |
| "learning_rate": 0.00018819182330033103, |
| "loss": 0.798, |
| "step": 5720 |
| }, |
| { |
| "epoch": 0.18234378474538582, |
| "grad_norm": 0.8669622540473938, |
| "learning_rate": 0.00018809463302399304, |
| "loss": 0.7911, |
| "step": 5740 |
| }, |
| { |
| "epoch": 0.18297912894310492, |
| "grad_norm": 0.8435181975364685, |
| "learning_rate": 0.0001879970697194833, |
| "loss": 0.7951, |
| "step": 5760 |
| }, |
| { |
| "epoch": 0.18361447314082405, |
| "grad_norm": 1.1023324728012085, |
| "learning_rate": 0.00018789913379992418, |
| "loss": 0.8253, |
| "step": 5780 |
| }, |
| { |
| "epoch": 0.18424981733854315, |
| "grad_norm": 0.9319256544113159, |
| "learning_rate": 0.00018780082568001585, |
| "loss": 0.7625, |
| "step": 5800 |
| }, |
| { |
| "epoch": 0.18488516153626228, |
| "grad_norm": 0.8259923458099365, |
| "learning_rate": 0.00018770214577603443, |
| "loss": 0.8079, |
| "step": 5820 |
| }, |
| { |
| "epoch": 0.18552050573398138, |
| "grad_norm": 0.8953514695167542, |
| "learning_rate": 0.00018760309450583043, |
| "loss": 0.7647, |
| "step": 5840 |
| }, |
| { |
| "epoch": 0.1861558499317005, |
| "grad_norm": 0.8347587585449219, |
| "learning_rate": 0.00018750367228882685, |
| "loss": 0.8089, |
| "step": 5860 |
| }, |
| { |
| "epoch": 0.1867911941294196, |
| "grad_norm": 0.9788545966148376, |
| "learning_rate": 0.00018740387954601742, |
| "loss": 0.7737, |
| "step": 5880 |
| }, |
| { |
| "epoch": 0.18742653832713874, |
| "grad_norm": 0.9509750008583069, |
| "learning_rate": 0.00018730371669996478, |
| "loss": 0.8073, |
| "step": 5900 |
| }, |
| { |
| "epoch": 0.18806188252485784, |
| "grad_norm": 0.9388551115989685, |
| "learning_rate": 0.0001872031841747988, |
| "loss": 0.7585, |
| "step": 5920 |
| }, |
| { |
| "epoch": 0.18869722672257697, |
| "grad_norm": 0.8342726826667786, |
| "learning_rate": 0.00018710228239621476, |
| "loss": 0.8025, |
| "step": 5940 |
| }, |
| { |
| "epoch": 0.18933257092029607, |
| "grad_norm": 1.0455151796340942, |
| "learning_rate": 0.00018700101179147134, |
| "loss": 0.7603, |
| "step": 5960 |
| }, |
| { |
| "epoch": 0.1899679151180152, |
| "grad_norm": 0.820931077003479, |
| "learning_rate": 0.00018689937278938915, |
| "loss": 0.7972, |
| "step": 5980 |
| }, |
| { |
| "epoch": 0.1906032593157343, |
| "grad_norm": 0.8494334816932678, |
| "learning_rate": 0.00018679736582034867, |
| "loss": 0.7663, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.1906032593157343, |
| "eval_loss": 0.7605160474777222, |
| "eval_runtime": 45.0866, |
| "eval_samples_per_second": 59.951, |
| "eval_steps_per_second": 29.987, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.19123860351345343, |
| "grad_norm": 0.9915199279785156, |
| "learning_rate": 0.00018669499131628847, |
| "loss": 0.7911, |
| "step": 6020 |
| }, |
| { |
| "epoch": 0.19187394771117253, |
| "grad_norm": 1.009752869606018, |
| "learning_rate": 0.00018659739550293418, |
| "loss": 0.7791, |
| "step": 6040 |
| }, |
| { |
| "epoch": 0.19250929190889166, |
| "grad_norm": 1.008296012878418, |
| "learning_rate": 0.00018649430555384115, |
| "loss": 0.7741, |
| "step": 6060 |
| }, |
| { |
| "epoch": 0.19314463610661076, |
| "grad_norm": 0.9730678200721741, |
| "learning_rate": 0.0001863908493530077, |
| "loss": 0.8028, |
| "step": 6080 |
| }, |
| { |
| "epoch": 0.19377998030432986, |
| "grad_norm": 0.8386117815971375, |
| "learning_rate": 0.0001862870273385091, |
| "loss": 0.789, |
| "step": 6100 |
| }, |
| { |
| "epoch": 0.194415324502049, |
| "grad_norm": 0.8517867922782898, |
| "learning_rate": 0.00018618283994996954, |
| "loss": 0.7472, |
| "step": 6120 |
| }, |
| { |
| "epoch": 0.1950506686997681, |
| "grad_norm": 0.8791770339012146, |
| "learning_rate": 0.00018607828762856046, |
| "loss": 0.7871, |
| "step": 6140 |
| }, |
| { |
| "epoch": 0.19568601289748722, |
| "grad_norm": 0.9248822331428528, |
| "learning_rate": 0.00018597337081699848, |
| "loss": 0.762, |
| "step": 6160 |
| }, |
| { |
| "epoch": 0.19632135709520632, |
| "grad_norm": 0.8059686422348022, |
| "learning_rate": 0.00018586808995954367, |
| "loss": 0.7345, |
| "step": 6180 |
| }, |
| { |
| "epoch": 0.19695670129292545, |
| "grad_norm": 0.7610188126564026, |
| "learning_rate": 0.00018576244550199758, |
| "loss": 0.7478, |
| "step": 6200 |
| }, |
| { |
| "epoch": 0.19759204549064455, |
| "grad_norm": 0.7763079404830933, |
| "learning_rate": 0.00018565643789170144, |
| "loss": 0.7552, |
| "step": 6220 |
| }, |
| { |
| "epoch": 0.19822738968836368, |
| "grad_norm": 1.1734811067581177, |
| "learning_rate": 0.00018555006757753418, |
| "loss": 0.7645, |
| "step": 6240 |
| }, |
| { |
| "epoch": 0.19886273388608278, |
| "grad_norm": 0.7641186714172363, |
| "learning_rate": 0.00018544333500991053, |
| "loss": 0.7267, |
| "step": 6260 |
| }, |
| { |
| "epoch": 0.1994980780838019, |
| "grad_norm": 0.8322380781173706, |
| "learning_rate": 0.00018533624064077922, |
| "loss": 0.7601, |
| "step": 6280 |
| }, |
| { |
| "epoch": 0.200133422281521, |
| "grad_norm": 0.9059064388275146, |
| "learning_rate": 0.00018522878492362096, |
| "loss": 0.7716, |
| "step": 6300 |
| }, |
| { |
| "epoch": 0.20076876647924013, |
| "grad_norm": 0.7728195786476135, |
| "learning_rate": 0.00018512096831344653, |
| "loss": 0.7435, |
| "step": 6320 |
| }, |
| { |
| "epoch": 0.20140411067695924, |
| "grad_norm": 0.9880885481834412, |
| "learning_rate": 0.00018501279126679495, |
| "loss": 0.7378, |
| "step": 6340 |
| }, |
| { |
| "epoch": 0.20203945487467836, |
| "grad_norm": 0.8192346096038818, |
| "learning_rate": 0.00018490425424173138, |
| "loss": 0.7376, |
| "step": 6360 |
| }, |
| { |
| "epoch": 0.20267479907239747, |
| "grad_norm": 1.175627589225769, |
| "learning_rate": 0.0001847953576978453, |
| "loss": 0.7672, |
| "step": 6380 |
| }, |
| { |
| "epoch": 0.2033101432701166, |
| "grad_norm": 0.7959802746772766, |
| "learning_rate": 0.0001846861020962486, |
| "loss": 0.7331, |
| "step": 6400 |
| }, |
| { |
| "epoch": 0.2039454874678357, |
| "grad_norm": 0.8343777060508728, |
| "learning_rate": 0.0001845764878995735, |
| "loss": 0.7142, |
| "step": 6420 |
| }, |
| { |
| "epoch": 0.20458083166555482, |
| "grad_norm": 0.9900172352790833, |
| "learning_rate": 0.00018446651557197066, |
| "loss": 0.7819, |
| "step": 6440 |
| }, |
| { |
| "epoch": 0.20521617586327393, |
| "grad_norm": 1.111018180847168, |
| "learning_rate": 0.00018435618557910725, |
| "loss": 0.7226, |
| "step": 6460 |
| }, |
| { |
| "epoch": 0.20585152006099305, |
| "grad_norm": 0.9301121830940247, |
| "learning_rate": 0.00018424549838816492, |
| "loss": 0.7295, |
| "step": 6480 |
| }, |
| { |
| "epoch": 0.20648686425871215, |
| "grad_norm": 0.894797146320343, |
| "learning_rate": 0.0001841344544678378, |
| "loss": 0.7199, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.20712220845643128, |
| "grad_norm": 1.041779637336731, |
| "learning_rate": 0.0001840230542883306, |
| "loss": 0.7213, |
| "step": 6520 |
| }, |
| { |
| "epoch": 0.20775755265415038, |
| "grad_norm": 0.9267428517341614, |
| "learning_rate": 0.00018391129832135659, |
| "loss": 0.7463, |
| "step": 6540 |
| }, |
| { |
| "epoch": 0.2083928968518695, |
| "grad_norm": 0.8043299913406372, |
| "learning_rate": 0.00018379918704013556, |
| "loss": 0.6909, |
| "step": 6560 |
| }, |
| { |
| "epoch": 0.20902824104958861, |
| "grad_norm": 0.8037667870521545, |
| "learning_rate": 0.0001836867209193918, |
| "loss": 0.7307, |
| "step": 6580 |
| }, |
| { |
| "epoch": 0.20966358524730772, |
| "grad_norm": 0.9795257449150085, |
| "learning_rate": 0.00018357390043535228, |
| "loss": 0.7625, |
| "step": 6600 |
| }, |
| { |
| "epoch": 0.21029892944502684, |
| "grad_norm": 1.0763206481933594, |
| "learning_rate": 0.0001834607260657443, |
| "loss": 0.7457, |
| "step": 6620 |
| }, |
| { |
| "epoch": 0.21093427364274595, |
| "grad_norm": 0.8083770275115967, |
| "learning_rate": 0.00018334719828979373, |
| "loss": 0.7398, |
| "step": 6640 |
| }, |
| { |
| "epoch": 0.21156961784046507, |
| "grad_norm": 0.8648799657821655, |
| "learning_rate": 0.00018323331758822299, |
| "loss": 0.7392, |
| "step": 6660 |
| }, |
| { |
| "epoch": 0.21220496203818418, |
| "grad_norm": 1.322874903678894, |
| "learning_rate": 0.0001831190844432488, |
| "loss": 0.767, |
| "step": 6680 |
| }, |
| { |
| "epoch": 0.2128403062359033, |
| "grad_norm": 0.8415853977203369, |
| "learning_rate": 0.00018300449933858034, |
| "loss": 0.7123, |
| "step": 6700 |
| }, |
| { |
| "epoch": 0.2134756504336224, |
| "grad_norm": 0.8832991123199463, |
| "learning_rate": 0.00018288956275941713, |
| "loss": 0.7329, |
| "step": 6720 |
| }, |
| { |
| "epoch": 0.21411099463134153, |
| "grad_norm": 0.8079715967178345, |
| "learning_rate": 0.00018277427519244692, |
| "loss": 0.6988, |
| "step": 6740 |
| }, |
| { |
| "epoch": 0.21474633882906063, |
| "grad_norm": 0.9029518365859985, |
| "learning_rate": 0.00018265863712584377, |
| "loss": 0.6943, |
| "step": 6760 |
| }, |
| { |
| "epoch": 0.21538168302677976, |
| "grad_norm": 0.9082062244415283, |
| "learning_rate": 0.0001825426490492658, |
| "loss": 0.7517, |
| "step": 6780 |
| }, |
| { |
| "epoch": 0.21601702722449886, |
| "grad_norm": 0.9031996726989746, |
| "learning_rate": 0.00018242631145385329, |
| "loss": 0.7108, |
| "step": 6800 |
| }, |
| { |
| "epoch": 0.216652371422218, |
| "grad_norm": 0.9114848375320435, |
| "learning_rate": 0.00018230962483222648, |
| "loss": 0.7151, |
| "step": 6820 |
| }, |
| { |
| "epoch": 0.2172877156199371, |
| "grad_norm": 0.8056477308273315, |
| "learning_rate": 0.00018219258967848355, |
| "loss": 0.7154, |
| "step": 6840 |
| }, |
| { |
| "epoch": 0.21792305981765622, |
| "grad_norm": 0.9029595255851746, |
| "learning_rate": 0.0001820752064881985, |
| "loss": 0.728, |
| "step": 6860 |
| }, |
| { |
| "epoch": 0.21855840401537532, |
| "grad_norm": 0.9304366707801819, |
| "learning_rate": 0.00018195747575841905, |
| "loss": 0.7298, |
| "step": 6880 |
| }, |
| { |
| "epoch": 0.21919374821309445, |
| "grad_norm": 1.2549713850021362, |
| "learning_rate": 0.00018183939798766452, |
| "loss": 0.7166, |
| "step": 6900 |
| }, |
| { |
| "epoch": 0.21982909241081355, |
| "grad_norm": 0.8609549403190613, |
| "learning_rate": 0.0001817209736759238, |
| "loss": 0.7222, |
| "step": 6920 |
| }, |
| { |
| "epoch": 0.22046443660853268, |
| "grad_norm": 0.9668901562690735, |
| "learning_rate": 0.00018160220332465315, |
| "loss": 0.706, |
| "step": 6940 |
| }, |
| { |
| "epoch": 0.22109978080625178, |
| "grad_norm": 0.9426187872886658, |
| "learning_rate": 0.00018148308743677407, |
| "loss": 0.7549, |
| "step": 6960 |
| }, |
| { |
| "epoch": 0.2217351250039709, |
| "grad_norm": 1.0274590253829956, |
| "learning_rate": 0.00018136362651667123, |
| "loss": 0.7118, |
| "step": 6980 |
| }, |
| { |
| "epoch": 0.22237046920169, |
| "grad_norm": 1.0056123733520508, |
| "learning_rate": 0.00018124382107019028, |
| "loss": 0.7284, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.22237046920169, |
| "eval_loss": 0.6820850968360901, |
| "eval_runtime": 44.1137, |
| "eval_samples_per_second": 61.274, |
| "eval_steps_per_second": 30.648, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.22300581339940914, |
| "grad_norm": 1.01372492313385, |
| "learning_rate": 0.0001811236716046358, |
| "loss": 0.7306, |
| "step": 7020 |
| }, |
| { |
| "epoch": 0.22364115759712824, |
| "grad_norm": 0.8217781782150269, |
| "learning_rate": 0.000181003178628769, |
| "loss": 0.7216, |
| "step": 7040 |
| }, |
| { |
| "epoch": 0.22427650179484737, |
| "grad_norm": 0.9484082460403442, |
| "learning_rate": 0.00018088234265280573, |
| "loss": 0.7164, |
| "step": 7060 |
| }, |
| { |
| "epoch": 0.22491184599256647, |
| "grad_norm": 1.2144994735717773, |
| "learning_rate": 0.0001807672312378185, |
| "loss": 0.7248, |
| "step": 7080 |
| }, |
| { |
| "epoch": 0.22554719019028557, |
| "grad_norm": 0.9574259519577026, |
| "learning_rate": 0.00018064572788467363, |
| "loss": 0.689, |
| "step": 7100 |
| }, |
| { |
| "epoch": 0.2261825343880047, |
| "grad_norm": 0.7626876831054688, |
| "learning_rate": 0.00018052998338935085, |
| "loss": 0.748, |
| "step": 7120 |
| }, |
| { |
| "epoch": 0.2268178785857238, |
| "grad_norm": 0.8534376621246338, |
| "learning_rate": 0.00018040781461538648, |
| "loss": 0.6947, |
| "step": 7140 |
| }, |
| { |
| "epoch": 0.22745322278344293, |
| "grad_norm": 1.0029544830322266, |
| "learning_rate": 0.00018028530536233676, |
| "loss": 0.7319, |
| "step": 7160 |
| }, |
| { |
| "epoch": 0.22808856698116203, |
| "grad_norm": 0.925713300704956, |
| "learning_rate": 0.00018016245614895518, |
| "loss": 0.7092, |
| "step": 7180 |
| }, |
| { |
| "epoch": 0.22872391117888116, |
| "grad_norm": 0.8006899952888489, |
| "learning_rate": 0.00018003926749543488, |
| "loss": 0.6879, |
| "step": 7200 |
| }, |
| { |
| "epoch": 0.22935925537660026, |
| "grad_norm": 0.8886255025863647, |
| "learning_rate": 0.00017991573992340616, |
| "loss": 0.6784, |
| "step": 7220 |
| }, |
| { |
| "epoch": 0.2299945995743194, |
| "grad_norm": 0.8108293414115906, |
| "learning_rate": 0.00017979187395593459, |
| "loss": 0.7094, |
| "step": 7240 |
| }, |
| { |
| "epoch": 0.2306299437720385, |
| "grad_norm": 1.0475900173187256, |
| "learning_rate": 0.00017966767011751858, |
| "loss": 0.696, |
| "step": 7260 |
| }, |
| { |
| "epoch": 0.23126528796975762, |
| "grad_norm": 0.9214044809341431, |
| "learning_rate": 0.0001795431289340872, |
| "loss": 0.7125, |
| "step": 7280 |
| }, |
| { |
| "epoch": 0.23190063216747672, |
| "grad_norm": 0.996101975440979, |
| "learning_rate": 0.00017941825093299802, |
| "loss": 0.6635, |
| "step": 7300 |
| }, |
| { |
| "epoch": 0.23253597636519585, |
| "grad_norm": 0.9577082991600037, |
| "learning_rate": 0.00017929303664303482, |
| "loss": 0.6753, |
| "step": 7320 |
| }, |
| { |
| "epoch": 0.23317132056291495, |
| "grad_norm": 1.0278524160385132, |
| "learning_rate": 0.00017916748659440533, |
| "loss": 0.7024, |
| "step": 7340 |
| }, |
| { |
| "epoch": 0.23380666476063408, |
| "grad_norm": 0.758007287979126, |
| "learning_rate": 0.00017904160131873906, |
| "loss": 0.6877, |
| "step": 7360 |
| }, |
| { |
| "epoch": 0.23444200895835318, |
| "grad_norm": 0.8926889300346375, |
| "learning_rate": 0.00017891538134908502, |
| "loss": 0.7123, |
| "step": 7380 |
| }, |
| { |
| "epoch": 0.2350773531560723, |
| "grad_norm": 0.8747749924659729, |
| "learning_rate": 0.00017878882721990936, |
| "loss": 0.656, |
| "step": 7400 |
| }, |
| { |
| "epoch": 0.2357126973537914, |
| "grad_norm": 1.012324333190918, |
| "learning_rate": 0.00017866193946709327, |
| "loss": 0.6885, |
| "step": 7420 |
| }, |
| { |
| "epoch": 0.23634804155151054, |
| "grad_norm": 0.7973082065582275, |
| "learning_rate": 0.00017853471862793068, |
| "loss": 0.6627, |
| "step": 7440 |
| }, |
| { |
| "epoch": 0.23698338574922964, |
| "grad_norm": 0.8259735107421875, |
| "learning_rate": 0.00017840716524112582, |
| "loss": 0.6861, |
| "step": 7460 |
| }, |
| { |
| "epoch": 0.23761872994694877, |
| "grad_norm": 0.7817295789718628, |
| "learning_rate": 0.00017827927984679113, |
| "loss": 0.6808, |
| "step": 7480 |
| }, |
| { |
| "epoch": 0.23825407414466787, |
| "grad_norm": 0.8139945864677429, |
| "learning_rate": 0.00017815106298644495, |
| "loss": 0.6891, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.238889418342387, |
| "grad_norm": 1.0507733821868896, |
| "learning_rate": 0.00017802251520300906, |
| "loss": 0.6936, |
| "step": 7520 |
| }, |
| { |
| "epoch": 0.2395247625401061, |
| "grad_norm": 0.929937481880188, |
| "learning_rate": 0.0001778936370408066, |
| "loss": 0.687, |
| "step": 7540 |
| }, |
| { |
| "epoch": 0.24016010673782523, |
| "grad_norm": 1.0632777214050293, |
| "learning_rate": 0.00017776442904555962, |
| "loss": 0.6656, |
| "step": 7560 |
| }, |
| { |
| "epoch": 0.24079545093554433, |
| "grad_norm": 1.1247339248657227, |
| "learning_rate": 0.00017763489176438686, |
| "loss": 0.6645, |
| "step": 7580 |
| }, |
| { |
| "epoch": 0.24143079513326343, |
| "grad_norm": 0.8897901773452759, |
| "learning_rate": 0.00017750502574580135, |
| "loss": 0.6832, |
| "step": 7600 |
| }, |
| { |
| "epoch": 0.24206613933098256, |
| "grad_norm": 0.9285283088684082, |
| "learning_rate": 0.00017737483153970816, |
| "loss": 0.6841, |
| "step": 7620 |
| }, |
| { |
| "epoch": 0.24270148352870166, |
| "grad_norm": 0.8733476400375366, |
| "learning_rate": 0.00017724430969740196, |
| "loss": 0.6567, |
| "step": 7640 |
| }, |
| { |
| "epoch": 0.2433368277264208, |
| "grad_norm": 0.9532790184020996, |
| "learning_rate": 0.0001771134607715649, |
| "loss": 0.6795, |
| "step": 7660 |
| }, |
| { |
| "epoch": 0.2439721719241399, |
| "grad_norm": 1.0881035327911377, |
| "learning_rate": 0.00017698228531626398, |
| "loss": 0.693, |
| "step": 7680 |
| }, |
| { |
| "epoch": 0.24460751612185902, |
| "grad_norm": 1.0936851501464844, |
| "learning_rate": 0.00017685078388694897, |
| "loss": 0.6852, |
| "step": 7700 |
| }, |
| { |
| "epoch": 0.24524286031957812, |
| "grad_norm": 1.0439817905426025, |
| "learning_rate": 0.0001767189570404499, |
| "loss": 0.6746, |
| "step": 7720 |
| }, |
| { |
| "epoch": 0.24587820451729725, |
| "grad_norm": 0.8599082231521606, |
| "learning_rate": 0.00017658680533497477, |
| "loss": 0.6719, |
| "step": 7740 |
| }, |
| { |
| "epoch": 0.24651354871501635, |
| "grad_norm": 0.9633190035820007, |
| "learning_rate": 0.00017645432933010712, |
| "loss": 0.7091, |
| "step": 7760 |
| }, |
| { |
| "epoch": 0.24714889291273548, |
| "grad_norm": 0.8989465236663818, |
| "learning_rate": 0.00017632152958680378, |
| "loss": 0.6649, |
| "step": 7780 |
| }, |
| { |
| "epoch": 0.24778423711045458, |
| "grad_norm": 0.8468721508979797, |
| "learning_rate": 0.00017618840666739228, |
| "loss": 0.6789, |
| "step": 7800 |
| }, |
| { |
| "epoch": 0.2484195813081737, |
| "grad_norm": 0.8482181429862976, |
| "learning_rate": 0.00017605496113556882, |
| "loss": 0.6902, |
| "step": 7820 |
| }, |
| { |
| "epoch": 0.2490549255058928, |
| "grad_norm": 0.8012595176696777, |
| "learning_rate": 0.00017592119355639544, |
| "loss": 0.6733, |
| "step": 7840 |
| }, |
| { |
| "epoch": 0.24969026970361194, |
| "grad_norm": 0.8117650151252747, |
| "learning_rate": 0.00017578710449629804, |
| "loss": 0.6916, |
| "step": 7860 |
| }, |
| { |
| "epoch": 0.25032561390133107, |
| "grad_norm": 0.9711939096450806, |
| "learning_rate": 0.00017565269452306364, |
| "loss": 0.6701, |
| "step": 7880 |
| }, |
| { |
| "epoch": 0.25096095809905017, |
| "grad_norm": 0.8234876394271851, |
| "learning_rate": 0.00017551796420583833, |
| "loss": 0.62, |
| "step": 7900 |
| }, |
| { |
| "epoch": 0.25159630229676927, |
| "grad_norm": 0.8263707756996155, |
| "learning_rate": 0.00017538967420545803, |
| "loss": 0.6907, |
| "step": 7920 |
| }, |
| { |
| "epoch": 0.25223164649448837, |
| "grad_norm": 1.2548505067825317, |
| "learning_rate": 0.00017525432085959138, |
| "loss": 0.6644, |
| "step": 7940 |
| }, |
| { |
| "epoch": 0.2528669906922075, |
| "grad_norm": 1.1948567628860474, |
| "learning_rate": 0.00017511864885660835, |
| "loss": 0.6609, |
| "step": 7960 |
| }, |
| { |
| "epoch": 0.25350233488992663, |
| "grad_norm": 0.9310169219970703, |
| "learning_rate": 0.0001749826587709989, |
| "loss": 0.6757, |
| "step": 7980 |
| }, |
| { |
| "epoch": 0.25413767908764573, |
| "grad_norm": 0.8832531571388245, |
| "learning_rate": 0.00017484635117859983, |
| "loss": 0.6552, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.25413767908764573, |
| "eval_loss": 0.6333429217338562, |
| "eval_runtime": 44.406, |
| "eval_samples_per_second": 60.87, |
| "eval_steps_per_second": 30.446, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.25477302328536483, |
| "grad_norm": 0.7624004483222961, |
| "learning_rate": 0.00017470972665659245, |
| "loss": 0.6567, |
| "step": 8020 |
| }, |
| { |
| "epoch": 0.255408367483084, |
| "grad_norm": 0.9134401082992554, |
| "learning_rate": 0.00017457278578350002, |
| "loss": 0.6681, |
| "step": 8040 |
| }, |
| { |
| "epoch": 0.2560437116808031, |
| "grad_norm": 0.9597674608230591, |
| "learning_rate": 0.00017443552913918534, |
| "loss": 0.6818, |
| "step": 8060 |
| }, |
| { |
| "epoch": 0.2566790558785222, |
| "grad_norm": 0.961934506893158, |
| "learning_rate": 0.00017429795730484836, |
| "loss": 0.6833, |
| "step": 8080 |
| }, |
| { |
| "epoch": 0.2573144000762413, |
| "grad_norm": 0.9118033647537231, |
| "learning_rate": 0.00017416007086302367, |
| "loss": 0.6607, |
| "step": 8100 |
| }, |
| { |
| "epoch": 0.2579497442739604, |
| "grad_norm": 0.8447214961051941, |
| "learning_rate": 0.00017402187039757805, |
| "loss": 0.6409, |
| "step": 8120 |
| }, |
| { |
| "epoch": 0.25858508847167955, |
| "grad_norm": 1.010040044784546, |
| "learning_rate": 0.0001738833564937079, |
| "loss": 0.6761, |
| "step": 8140 |
| }, |
| { |
| "epoch": 0.25922043266939865, |
| "grad_norm": 0.8686466217041016, |
| "learning_rate": 0.00017374452973793693, |
| "loss": 0.6575, |
| "step": 8160 |
| }, |
| { |
| "epoch": 0.25985577686711775, |
| "grad_norm": 1.0445839166641235, |
| "learning_rate": 0.00017360539071811356, |
| "loss": 0.667, |
| "step": 8180 |
| }, |
| { |
| "epoch": 0.26049112106483685, |
| "grad_norm": 1.1015607118606567, |
| "learning_rate": 0.00017346594002340843, |
| "loss": 0.6468, |
| "step": 8200 |
| }, |
| { |
| "epoch": 0.261126465262556, |
| "grad_norm": 1.4550483226776123, |
| "learning_rate": 0.00017332617824431204, |
| "loss": 0.6642, |
| "step": 8220 |
| }, |
| { |
| "epoch": 0.2617618094602751, |
| "grad_norm": 0.8968580961227417, |
| "learning_rate": 0.000173186105972632, |
| "loss": 0.6695, |
| "step": 8240 |
| }, |
| { |
| "epoch": 0.2623971536579942, |
| "grad_norm": 0.9802786111831665, |
| "learning_rate": 0.00017304572380149078, |
| "loss": 0.6516, |
| "step": 8260 |
| }, |
| { |
| "epoch": 0.2630324978557133, |
| "grad_norm": 0.8785617351531982, |
| "learning_rate": 0.00017290503232532305, |
| "loss": 0.6857, |
| "step": 8280 |
| }, |
| { |
| "epoch": 0.26366784205343247, |
| "grad_norm": 0.8675135970115662, |
| "learning_rate": 0.00017276403213987323, |
| "loss": 0.6493, |
| "step": 8300 |
| }, |
| { |
| "epoch": 0.26430318625115157, |
| "grad_norm": 0.8159687519073486, |
| "learning_rate": 0.0001726227238421929, |
| "loss": 0.6445, |
| "step": 8320 |
| }, |
| { |
| "epoch": 0.26493853044887067, |
| "grad_norm": 0.8598359823226929, |
| "learning_rate": 0.00017248110803063833, |
| "loss": 0.6515, |
| "step": 8340 |
| }, |
| { |
| "epoch": 0.26557387464658977, |
| "grad_norm": 1.0304324626922607, |
| "learning_rate": 0.00017233918530486792, |
| "loss": 0.6431, |
| "step": 8360 |
| }, |
| { |
| "epoch": 0.2662092188443089, |
| "grad_norm": 0.933110773563385, |
| "learning_rate": 0.0001722040749834389, |
| "loss": 0.6958, |
| "step": 8380 |
| }, |
| { |
| "epoch": 0.266844563042028, |
| "grad_norm": 0.9690568447113037, |
| "learning_rate": 0.0001720615555046345, |
| "loss": 0.5922, |
| "step": 8400 |
| }, |
| { |
| "epoch": 0.26747990723974713, |
| "grad_norm": 0.9293822646141052, |
| "learning_rate": 0.0001719187308881687, |
| "loss": 0.6407, |
| "step": 8420 |
| }, |
| { |
| "epoch": 0.26811525143746623, |
| "grad_norm": 0.8957870602607727, |
| "learning_rate": 0.00017177560173881846, |
| "loss": 0.662, |
| "step": 8440 |
| }, |
| { |
| "epoch": 0.2687505956351854, |
| "grad_norm": 1.0288225412368774, |
| "learning_rate": 0.0001716321686626503, |
| "loss": 0.6395, |
| "step": 8460 |
| }, |
| { |
| "epoch": 0.2693859398329045, |
| "grad_norm": 0.838657021522522, |
| "learning_rate": 0.00017148843226701764, |
| "loss": 0.6313, |
| "step": 8480 |
| }, |
| { |
| "epoch": 0.2700212840306236, |
| "grad_norm": 0.8575971722602844, |
| "learning_rate": 0.00017134439316055834, |
| "loss": 0.6655, |
| "step": 8500 |
| }, |
| { |
| "epoch": 0.2706566282283427, |
| "grad_norm": 0.9840354919433594, |
| "learning_rate": 0.00017120005195319195, |
| "loss": 0.6646, |
| "step": 8520 |
| }, |
| { |
| "epoch": 0.27129197242606184, |
| "grad_norm": 0.8279704451560974, |
| "learning_rate": 0.00017105540925611737, |
| "loss": 0.6259, |
| "step": 8540 |
| }, |
| { |
| "epoch": 0.27192731662378095, |
| "grad_norm": 1.0609900951385498, |
| "learning_rate": 0.00017091046568180996, |
| "loss": 0.6561, |
| "step": 8560 |
| }, |
| { |
| "epoch": 0.27256266082150005, |
| "grad_norm": 0.890514612197876, |
| "learning_rate": 0.0001707652218440193, |
| "loss": 0.6324, |
| "step": 8580 |
| }, |
| { |
| "epoch": 0.27319800501921915, |
| "grad_norm": 0.9357948303222656, |
| "learning_rate": 0.0001706196783577663, |
| "loss": 0.6116, |
| "step": 8600 |
| }, |
| { |
| "epoch": 0.27383334921693825, |
| "grad_norm": 0.9577456116676331, |
| "learning_rate": 0.0001704738358393407, |
| "loss": 0.6764, |
| "step": 8620 |
| }, |
| { |
| "epoch": 0.2744686934146574, |
| "grad_norm": 0.834900438785553, |
| "learning_rate": 0.0001703276949062985, |
| "loss": 0.6324, |
| "step": 8640 |
| }, |
| { |
| "epoch": 0.2751040376123765, |
| "grad_norm": 0.8283354043960571, |
| "learning_rate": 0.00017018125617745933, |
| "loss": 0.6187, |
| "step": 8660 |
| }, |
| { |
| "epoch": 0.2757393818100956, |
| "grad_norm": 0.854200541973114, |
| "learning_rate": 0.00017003452027290373, |
| "loss": 0.6294, |
| "step": 8680 |
| }, |
| { |
| "epoch": 0.2763747260078147, |
| "grad_norm": 0.8695046901702881, |
| "learning_rate": 0.00016988748781397064, |
| "loss": 0.6377, |
| "step": 8700 |
| }, |
| { |
| "epoch": 0.27701007020553386, |
| "grad_norm": 0.7802212238311768, |
| "learning_rate": 0.00016974015942325475, |
| "loss": 0.6051, |
| "step": 8720 |
| }, |
| { |
| "epoch": 0.27764541440325297, |
| "grad_norm": 1.0842890739440918, |
| "learning_rate": 0.00016959253572460382, |
| "loss": 0.6352, |
| "step": 8740 |
| }, |
| { |
| "epoch": 0.27828075860097207, |
| "grad_norm": 0.8472367525100708, |
| "learning_rate": 0.0001694446173431161, |
| "loss": 0.5907, |
| "step": 8760 |
| }, |
| { |
| "epoch": 0.27891610279869117, |
| "grad_norm": 0.8548029661178589, |
| "learning_rate": 0.0001692964049051376, |
| "loss": 0.6434, |
| "step": 8780 |
| }, |
| { |
| "epoch": 0.2795514469964103, |
| "grad_norm": 0.9771581888198853, |
| "learning_rate": 0.00016914789903825945, |
| "loss": 0.6381, |
| "step": 8800 |
| }, |
| { |
| "epoch": 0.2801867911941294, |
| "grad_norm": 0.9199798703193665, |
| "learning_rate": 0.0001689991003713154, |
| "loss": 0.6589, |
| "step": 8820 |
| }, |
| { |
| "epoch": 0.2808221353918485, |
| "grad_norm": 1.0753369331359863, |
| "learning_rate": 0.00016885000953437894, |
| "loss": 0.6413, |
| "step": 8840 |
| }, |
| { |
| "epoch": 0.2814574795895676, |
| "grad_norm": 1.0925753116607666, |
| "learning_rate": 0.00016870062715876075, |
| "loss": 0.6234, |
| "step": 8860 |
| }, |
| { |
| "epoch": 0.2820928237872868, |
| "grad_norm": 1.0023586750030518, |
| "learning_rate": 0.00016855095387700598, |
| "loss": 0.6104, |
| "step": 8880 |
| }, |
| { |
| "epoch": 0.2827281679850059, |
| "grad_norm": 0.9077417254447937, |
| "learning_rate": 0.00016840099032289162, |
| "loss": 0.602, |
| "step": 8900 |
| }, |
| { |
| "epoch": 0.283363512182725, |
| "grad_norm": 0.8238940238952637, |
| "learning_rate": 0.00016825073713142374, |
| "loss": 0.6157, |
| "step": 8920 |
| }, |
| { |
| "epoch": 0.2839988563804441, |
| "grad_norm": 1.111948847770691, |
| "learning_rate": 0.000168100194938835, |
| "loss": 0.6092, |
| "step": 8940 |
| }, |
| { |
| "epoch": 0.28463420057816324, |
| "grad_norm": 1.0630967617034912, |
| "learning_rate": 0.0001679493643825816, |
| "loss": 0.5904, |
| "step": 8960 |
| }, |
| { |
| "epoch": 0.28526954477588234, |
| "grad_norm": 0.8827186822891235, |
| "learning_rate": 0.00016779824610134092, |
| "loss": 0.6166, |
| "step": 8980 |
| }, |
| { |
| "epoch": 0.28590488897360145, |
| "grad_norm": 0.9229192137718201, |
| "learning_rate": 0.00016764684073500866, |
| "loss": 0.6178, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.28590488897360145, |
| "eval_loss": 0.5966877341270447, |
| "eval_runtime": 44.6044, |
| "eval_samples_per_second": 60.599, |
| "eval_steps_per_second": 30.311, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.28654023317132055, |
| "grad_norm": 0.8136707544326782, |
| "learning_rate": 0.00016749514892469615, |
| "loss": 0.6366, |
| "step": 9020 |
| }, |
| { |
| "epoch": 0.2871755773690397, |
| "grad_norm": 0.8175415992736816, |
| "learning_rate": 0.00016734317131272762, |
| "loss": 0.6177, |
| "step": 9040 |
| }, |
| { |
| "epoch": 0.2878109215667588, |
| "grad_norm": 0.929182767868042, |
| "learning_rate": 0.00016719090854263753, |
| "loss": 0.646, |
| "step": 9060 |
| }, |
| { |
| "epoch": 0.2884462657644779, |
| "grad_norm": 0.9779849052429199, |
| "learning_rate": 0.0001670383612591678, |
| "loss": 0.6362, |
| "step": 9080 |
| }, |
| { |
| "epoch": 0.289081609962197, |
| "grad_norm": 0.8542407751083374, |
| "learning_rate": 0.00016688553010826506, |
| "loss": 0.6076, |
| "step": 9100 |
| }, |
| { |
| "epoch": 0.2897169541599161, |
| "grad_norm": 0.8885607719421387, |
| "learning_rate": 0.00016673241573707804, |
| "loss": 0.6055, |
| "step": 9120 |
| }, |
| { |
| "epoch": 0.29035229835763526, |
| "grad_norm": 0.876097559928894, |
| "learning_rate": 0.0001665790187939546, |
| "loss": 0.6196, |
| "step": 9140 |
| }, |
| { |
| "epoch": 0.29098764255535436, |
| "grad_norm": 1.0198227167129517, |
| "learning_rate": 0.0001664253399284393, |
| "loss": 0.6374, |
| "step": 9160 |
| }, |
| { |
| "epoch": 0.29162298675307347, |
| "grad_norm": 0.8938513994216919, |
| "learning_rate": 0.00016627137979127033, |
| "loss": 0.6254, |
| "step": 9180 |
| }, |
| { |
| "epoch": 0.29225833095079257, |
| "grad_norm": 0.7427443861961365, |
| "learning_rate": 0.00016611713903437692, |
| "loss": 0.6099, |
| "step": 9200 |
| }, |
| { |
| "epoch": 0.2928936751485117, |
| "grad_norm": 0.9959378242492676, |
| "learning_rate": 0.00016596261831087661, |
| "loss": 0.648, |
| "step": 9220 |
| }, |
| { |
| "epoch": 0.2935290193462308, |
| "grad_norm": 1.048519253730774, |
| "learning_rate": 0.00016580781827507242, |
| "loss": 0.6292, |
| "step": 9240 |
| }, |
| { |
| "epoch": 0.2941643635439499, |
| "grad_norm": 0.858858585357666, |
| "learning_rate": 0.00016565273958245002, |
| "loss": 0.6252, |
| "step": 9260 |
| }, |
| { |
| "epoch": 0.294799707741669, |
| "grad_norm": 0.8437022566795349, |
| "learning_rate": 0.00016549738288967514, |
| "loss": 0.6188, |
| "step": 9280 |
| }, |
| { |
| "epoch": 0.2954350519393882, |
| "grad_norm": 0.8608834743499756, |
| "learning_rate": 0.00016534174885459056, |
| "loss": 0.6509, |
| "step": 9300 |
| }, |
| { |
| "epoch": 0.2960703961371073, |
| "grad_norm": 1.083897590637207, |
| "learning_rate": 0.00016518583813621357, |
| "loss": 0.6193, |
| "step": 9320 |
| }, |
| { |
| "epoch": 0.2967057403348264, |
| "grad_norm": 0.9606235027313232, |
| "learning_rate": 0.0001650296513947329, |
| "loss": 0.6287, |
| "step": 9340 |
| }, |
| { |
| "epoch": 0.2973410845325455, |
| "grad_norm": 1.0519804954528809, |
| "learning_rate": 0.00016487318929150617, |
| "loss": 0.6097, |
| "step": 9360 |
| }, |
| { |
| "epoch": 0.29797642873026464, |
| "grad_norm": 1.3490453958511353, |
| "learning_rate": 0.000164716452489057, |
| "loss": 0.6043, |
| "step": 9380 |
| }, |
| { |
| "epoch": 0.29861177292798374, |
| "grad_norm": 1.1292142868041992, |
| "learning_rate": 0.00016455944165107207, |
| "loss": 0.5896, |
| "step": 9400 |
| }, |
| { |
| "epoch": 0.29924711712570284, |
| "grad_norm": 0.9570278525352478, |
| "learning_rate": 0.00016440215744239865, |
| "loss": 0.6087, |
| "step": 9420 |
| }, |
| { |
| "epoch": 0.29988246132342194, |
| "grad_norm": 0.8570756316184998, |
| "learning_rate": 0.00016424460052904137, |
| "loss": 0.6036, |
| "step": 9440 |
| }, |
| { |
| "epoch": 0.3005178055211411, |
| "grad_norm": 0.9214951395988464, |
| "learning_rate": 0.00016408677157815974, |
| "loss": 0.6519, |
| "step": 9460 |
| }, |
| { |
| "epoch": 0.3011531497188602, |
| "grad_norm": 1.1580623388290405, |
| "learning_rate": 0.00016392867125806504, |
| "loss": 0.5991, |
| "step": 9480 |
| }, |
| { |
| "epoch": 0.3017884939165793, |
| "grad_norm": 1.1025846004486084, |
| "learning_rate": 0.00016377030023821782, |
| "loss": 0.6416, |
| "step": 9500 |
| }, |
| { |
| "epoch": 0.3024238381142984, |
| "grad_norm": 0.8918984532356262, |
| "learning_rate": 0.00016361165918922477, |
| "loss": 0.6165, |
| "step": 9520 |
| }, |
| { |
| "epoch": 0.30305918231201756, |
| "grad_norm": 0.8747968673706055, |
| "learning_rate": 0.000163452748782836, |
| "loss": 0.6094, |
| "step": 9540 |
| }, |
| { |
| "epoch": 0.30369452650973666, |
| "grad_norm": 0.7480270862579346, |
| "learning_rate": 0.0001632935696919422, |
| "loss": 0.5987, |
| "step": 9560 |
| }, |
| { |
| "epoch": 0.30432987070745576, |
| "grad_norm": 0.8854328393936157, |
| "learning_rate": 0.00016313412259057178, |
| "loss": 0.6514, |
| "step": 9580 |
| }, |
| { |
| "epoch": 0.30496521490517486, |
| "grad_norm": 1.0659030675888062, |
| "learning_rate": 0.00016297440815388802, |
| "loss": 0.5796, |
| "step": 9600 |
| }, |
| { |
| "epoch": 0.305600559102894, |
| "grad_norm": 0.9668769240379333, |
| "learning_rate": 0.00016281442705818618, |
| "loss": 0.6147, |
| "step": 9620 |
| }, |
| { |
| "epoch": 0.3062359033006131, |
| "grad_norm": 0.939028263092041, |
| "learning_rate": 0.00016265417998089068, |
| "loss": 0.6241, |
| "step": 9640 |
| }, |
| { |
| "epoch": 0.3068712474983322, |
| "grad_norm": 0.8955005407333374, |
| "learning_rate": 0.00016249366760055222, |
| "loss": 0.5832, |
| "step": 9660 |
| }, |
| { |
| "epoch": 0.3075065916960513, |
| "grad_norm": 0.7991370558738708, |
| "learning_rate": 0.00016233289059684492, |
| "loss": 0.5799, |
| "step": 9680 |
| }, |
| { |
| "epoch": 0.3081419358937704, |
| "grad_norm": 0.8115846514701843, |
| "learning_rate": 0.00016217184965056336, |
| "loss": 0.6109, |
| "step": 9700 |
| }, |
| { |
| "epoch": 0.3087772800914896, |
| "grad_norm": 0.7488042712211609, |
| "learning_rate": 0.00016201054544361977, |
| "loss": 0.6166, |
| "step": 9720 |
| }, |
| { |
| "epoch": 0.3094126242892087, |
| "grad_norm": 0.8463062644004822, |
| "learning_rate": 0.00016184897865904123, |
| "loss": 0.5779, |
| "step": 9740 |
| }, |
| { |
| "epoch": 0.3100479684869278, |
| "grad_norm": 1.083001732826233, |
| "learning_rate": 0.00016168714998096654, |
| "loss": 0.6175, |
| "step": 9760 |
| }, |
| { |
| "epoch": 0.3106833126846469, |
| "grad_norm": 0.8545092940330505, |
| "learning_rate": 0.00016152506009464357, |
| "loss": 0.6104, |
| "step": 9780 |
| }, |
| { |
| "epoch": 0.31131865688236604, |
| "grad_norm": 0.9297589063644409, |
| "learning_rate": 0.00016136270968642618, |
| "loss": 0.5831, |
| "step": 9800 |
| }, |
| { |
| "epoch": 0.31195400108008514, |
| "grad_norm": 0.7775977253913879, |
| "learning_rate": 0.0001612000994437714, |
| "loss": 0.6001, |
| "step": 9820 |
| }, |
| { |
| "epoch": 0.31258934527780424, |
| "grad_norm": 0.943267822265625, |
| "learning_rate": 0.0001610372300552366, |
| "loss": 0.6089, |
| "step": 9840 |
| }, |
| { |
| "epoch": 0.31322468947552334, |
| "grad_norm": 0.8398995399475098, |
| "learning_rate": 0.0001608741022104763, |
| "loss": 0.5929, |
| "step": 9860 |
| }, |
| { |
| "epoch": 0.3138600336732425, |
| "grad_norm": 1.0078269243240356, |
| "learning_rate": 0.00016071071660023954, |
| "loss": 0.6215, |
| "step": 9880 |
| }, |
| { |
| "epoch": 0.3144953778709616, |
| "grad_norm": 0.9710105657577515, |
| "learning_rate": 0.0001605470739163669, |
| "loss": 0.5983, |
| "step": 9900 |
| }, |
| { |
| "epoch": 0.3151307220686807, |
| "grad_norm": 0.8864800333976746, |
| "learning_rate": 0.00016038317485178734, |
| "loss": 0.5812, |
| "step": 9920 |
| }, |
| { |
| "epoch": 0.3157660662663998, |
| "grad_norm": 0.9775105118751526, |
| "learning_rate": 0.0001602190201005156, |
| "loss": 0.5899, |
| "step": 9940 |
| }, |
| { |
| "epoch": 0.31640141046411896, |
| "grad_norm": 0.8554601669311523, |
| "learning_rate": 0.00016005461035764902, |
| "loss": 0.5989, |
| "step": 9960 |
| }, |
| { |
| "epoch": 0.31703675466183806, |
| "grad_norm": 0.8149896264076233, |
| "learning_rate": 0.0001598899463193647, |
| "loss": 0.6383, |
| "step": 9980 |
| }, |
| { |
| "epoch": 0.31767209885955716, |
| "grad_norm": 1.1985602378845215, |
| "learning_rate": 0.00015972502868291652, |
| "loss": 0.604, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.31767209885955716, |
| "eval_loss": 0.5633410811424255, |
| "eval_runtime": 44.2566, |
| "eval_samples_per_second": 61.076, |
| "eval_steps_per_second": 30.549, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.31830744305727626, |
| "grad_norm": 0.9848890900611877, |
| "learning_rate": 0.0001595598581466322, |
| "loss": 0.5741, |
| "step": 10020 |
| }, |
| { |
| "epoch": 0.3189427872549954, |
| "grad_norm": 1.0653225183486938, |
| "learning_rate": 0.00015939443540991034, |
| "loss": 0.6154, |
| "step": 10040 |
| }, |
| { |
| "epoch": 0.3195781314527145, |
| "grad_norm": 0.8440039157867432, |
| "learning_rate": 0.0001592287611732175, |
| "loss": 0.6077, |
| "step": 10060 |
| }, |
| { |
| "epoch": 0.3202134756504336, |
| "grad_norm": 0.8706631660461426, |
| "learning_rate": 0.00015906283613808508, |
| "loss": 0.6143, |
| "step": 10080 |
| }, |
| { |
| "epoch": 0.3208488198481527, |
| "grad_norm": 1.0338808298110962, |
| "learning_rate": 0.00015889666100710659, |
| "loss": 0.5697, |
| "step": 10100 |
| }, |
| { |
| "epoch": 0.3214841640458719, |
| "grad_norm": 0.8499680757522583, |
| "learning_rate": 0.00015873023648393448, |
| "loss": 0.5968, |
| "step": 10120 |
| }, |
| { |
| "epoch": 0.322119508243591, |
| "grad_norm": 1.0106873512268066, |
| "learning_rate": 0.00015856356327327724, |
| "loss": 0.5657, |
| "step": 10140 |
| }, |
| { |
| "epoch": 0.3227548524413101, |
| "grad_norm": 0.9771645665168762, |
| "learning_rate": 0.00015839664208089634, |
| "loss": 0.5989, |
| "step": 10160 |
| }, |
| { |
| "epoch": 0.3233901966390292, |
| "grad_norm": 0.9425153136253357, |
| "learning_rate": 0.0001582294736136035, |
| "loss": 0.6314, |
| "step": 10180 |
| }, |
| { |
| "epoch": 0.3240255408367483, |
| "grad_norm": 1.1419885158538818, |
| "learning_rate": 0.0001580620585792572, |
| "loss": 0.6137, |
| "step": 10200 |
| }, |
| { |
| "epoch": 0.32466088503446744, |
| "grad_norm": 0.8356417417526245, |
| "learning_rate": 0.00015789439768676032, |
| "loss": 0.6189, |
| "step": 10220 |
| }, |
| { |
| "epoch": 0.32529622923218654, |
| "grad_norm": 0.9876666069030762, |
| "learning_rate": 0.00015772649164605648, |
| "loss": 0.6069, |
| "step": 10240 |
| }, |
| { |
| "epoch": 0.32593157342990564, |
| "grad_norm": 1.0510075092315674, |
| "learning_rate": 0.0001575583411681276, |
| "loss": 0.5996, |
| "step": 10260 |
| }, |
| { |
| "epoch": 0.32656691762762474, |
| "grad_norm": 0.91109299659729, |
| "learning_rate": 0.00015738994696499055, |
| "loss": 0.5996, |
| "step": 10280 |
| }, |
| { |
| "epoch": 0.3272022618253439, |
| "grad_norm": 0.8995181322097778, |
| "learning_rate": 0.00015722130974969421, |
| "loss": 0.5798, |
| "step": 10300 |
| }, |
| { |
| "epoch": 0.327837606023063, |
| "grad_norm": 1.1067475080490112, |
| "learning_rate": 0.00015705243023631652, |
| "loss": 0.5983, |
| "step": 10320 |
| }, |
| { |
| "epoch": 0.3284729502207821, |
| "grad_norm": 1.0324633121490479, |
| "learning_rate": 0.00015688330913996135, |
| "loss": 0.6011, |
| "step": 10340 |
| }, |
| { |
| "epoch": 0.3291082944185012, |
| "grad_norm": 1.0662481784820557, |
| "learning_rate": 0.0001567139471767556, |
| "loss": 0.6254, |
| "step": 10360 |
| }, |
| { |
| "epoch": 0.32974363861622036, |
| "grad_norm": 0.9539555907249451, |
| "learning_rate": 0.00015654434506384607, |
| "loss": 0.6176, |
| "step": 10380 |
| }, |
| { |
| "epoch": 0.33037898281393946, |
| "grad_norm": 0.7341588139533997, |
| "learning_rate": 0.00015637450351939637, |
| "loss": 0.5852, |
| "step": 10400 |
| }, |
| { |
| "epoch": 0.33101432701165856, |
| "grad_norm": 0.9077139496803284, |
| "learning_rate": 0.00015620442326258414, |
| "loss": 0.609, |
| "step": 10420 |
| }, |
| { |
| "epoch": 0.33164967120937766, |
| "grad_norm": 1.083999752998352, |
| "learning_rate": 0.00015603410501359766, |
| "loss": 0.5768, |
| "step": 10440 |
| }, |
| { |
| "epoch": 0.3322850154070968, |
| "grad_norm": 0.9190422296524048, |
| "learning_rate": 0.000155863549493633, |
| "loss": 0.5845, |
| "step": 10460 |
| }, |
| { |
| "epoch": 0.3329203596048159, |
| "grad_norm": 1.0731889009475708, |
| "learning_rate": 0.000155692757424891, |
| "loss": 0.5988, |
| "step": 10480 |
| }, |
| { |
| "epoch": 0.333555703802535, |
| "grad_norm": 0.9898316264152527, |
| "learning_rate": 0.00015552172953057407, |
| "loss": 0.5918, |
| "step": 10500 |
| }, |
| { |
| "epoch": 0.3341910480002541, |
| "grad_norm": 1.135695219039917, |
| "learning_rate": 0.00015535046653488322, |
| "loss": 0.5882, |
| "step": 10520 |
| }, |
| { |
| "epoch": 0.3348263921979733, |
| "grad_norm": 1.0453022718429565, |
| "learning_rate": 0.000155178969163015, |
| "loss": 0.609, |
| "step": 10540 |
| }, |
| { |
| "epoch": 0.3354617363956924, |
| "grad_norm": 0.9859703183174133, |
| "learning_rate": 0.00015500723814115835, |
| "loss": 0.5899, |
| "step": 10560 |
| }, |
| { |
| "epoch": 0.3360970805934115, |
| "grad_norm": 1.031168818473816, |
| "learning_rate": 0.00015483527419649163, |
| "loss": 0.5987, |
| "step": 10580 |
| }, |
| { |
| "epoch": 0.3367324247911306, |
| "grad_norm": 1.1591908931732178, |
| "learning_rate": 0.00015466307805717951, |
| "loss": 0.6191, |
| "step": 10600 |
| }, |
| { |
| "epoch": 0.33736776898884974, |
| "grad_norm": 0.8246921896934509, |
| "learning_rate": 0.00015449065045236977, |
| "loss": 0.6098, |
| "step": 10620 |
| }, |
| { |
| "epoch": 0.33800311318656884, |
| "grad_norm": 0.8392571210861206, |
| "learning_rate": 0.0001543179921121904, |
| "loss": 0.5675, |
| "step": 10640 |
| }, |
| { |
| "epoch": 0.33863845738428794, |
| "grad_norm": 0.8678343892097473, |
| "learning_rate": 0.00015414510376774633, |
| "loss": 0.5721, |
| "step": 10660 |
| }, |
| { |
| "epoch": 0.33927380158200704, |
| "grad_norm": 0.8436061143875122, |
| "learning_rate": 0.00015397198615111653, |
| "loss": 0.5703, |
| "step": 10680 |
| }, |
| { |
| "epoch": 0.33990914577972614, |
| "grad_norm": 0.9926438927650452, |
| "learning_rate": 0.00015379863999535074, |
| "loss": 0.6049, |
| "step": 10700 |
| }, |
| { |
| "epoch": 0.3405444899774453, |
| "grad_norm": 1.098764419555664, |
| "learning_rate": 0.00015362506603446637, |
| "loss": 0.6007, |
| "step": 10720 |
| }, |
| { |
| "epoch": 0.3411798341751644, |
| "grad_norm": 1.052038311958313, |
| "learning_rate": 0.00015345126500344554, |
| "loss": 0.5865, |
| "step": 10740 |
| }, |
| { |
| "epoch": 0.3418151783728835, |
| "grad_norm": 0.8772541880607605, |
| "learning_rate": 0.00015327723763823188, |
| "loss": 0.6066, |
| "step": 10760 |
| }, |
| { |
| "epoch": 0.3424505225706026, |
| "grad_norm": 0.7938296794891357, |
| "learning_rate": 0.00015310298467572733, |
| "loss": 0.5467, |
| "step": 10780 |
| }, |
| { |
| "epoch": 0.34308586676832176, |
| "grad_norm": 1.0938440561294556, |
| "learning_rate": 0.00015292850685378915, |
| "loss": 0.5916, |
| "step": 10800 |
| }, |
| { |
| "epoch": 0.34372121096604086, |
| "grad_norm": 0.8460657000541687, |
| "learning_rate": 0.00015275380491122672, |
| "loss": 0.603, |
| "step": 10820 |
| }, |
| { |
| "epoch": 0.34435655516375996, |
| "grad_norm": 0.8238389492034912, |
| "learning_rate": 0.00015257887958779854, |
| "loss": 0.5808, |
| "step": 10840 |
| }, |
| { |
| "epoch": 0.34499189936147906, |
| "grad_norm": 0.8064368367195129, |
| "learning_rate": 0.0001524037316242088, |
| "loss": 0.5862, |
| "step": 10860 |
| }, |
| { |
| "epoch": 0.3456272435591982, |
| "grad_norm": 1.2068203687667847, |
| "learning_rate": 0.00015222836176210467, |
| "loss": 0.5694, |
| "step": 10880 |
| }, |
| { |
| "epoch": 0.3462625877569173, |
| "grad_norm": 0.9752914309501648, |
| "learning_rate": 0.00015205277074407266, |
| "loss": 0.5367, |
| "step": 10900 |
| }, |
| { |
| "epoch": 0.3468979319546364, |
| "grad_norm": 0.9989959597587585, |
| "learning_rate": 0.00015187695931363602, |
| "loss": 0.5712, |
| "step": 10920 |
| }, |
| { |
| "epoch": 0.3475332761523555, |
| "grad_norm": 0.8734492659568787, |
| "learning_rate": 0.00015170092821525114, |
| "loss": 0.6029, |
| "step": 10940 |
| }, |
| { |
| "epoch": 0.3481686203500747, |
| "grad_norm": 0.8759735822677612, |
| "learning_rate": 0.00015152467819430458, |
| "loss": 0.5676, |
| "step": 10960 |
| }, |
| { |
| "epoch": 0.3488039645477938, |
| "grad_norm": 0.8554444909095764, |
| "learning_rate": 0.00015134820999711, |
| "loss": 0.5664, |
| "step": 10980 |
| }, |
| { |
| "epoch": 0.3494393087455129, |
| "grad_norm": 0.730451762676239, |
| "learning_rate": 0.00015117152437090482, |
| "loss": 0.5735, |
| "step": 11000 |
| }, |
| { |
| "epoch": 0.3494393087455129, |
| "eval_loss": 0.5449489951133728, |
| "eval_runtime": 44.9152, |
| "eval_samples_per_second": 60.18, |
| "eval_steps_per_second": 30.101, |
| "step": 11000 |
| }, |
| { |
| "epoch": 0.350074652943232, |
| "grad_norm": 0.7964712381362915, |
| "learning_rate": 0.00015099462206384718, |
| "loss": 0.5943, |
| "step": 11020 |
| }, |
| { |
| "epoch": 0.35070999714095114, |
| "grad_norm": 0.809177577495575, |
| "learning_rate": 0.00015081750382501277, |
| "loss": 0.5986, |
| "step": 11040 |
| }, |
| { |
| "epoch": 0.35134534133867024, |
| "grad_norm": 0.9207815527915955, |
| "learning_rate": 0.00015064017040439148, |
| "loss": 0.559, |
| "step": 11060 |
| }, |
| { |
| "epoch": 0.35198068553638934, |
| "grad_norm": 0.9813947677612305, |
| "learning_rate": 0.0001504626225528845, |
| "loss": 0.5529, |
| "step": 11080 |
| }, |
| { |
| "epoch": 0.35261602973410844, |
| "grad_norm": 0.9409967660903931, |
| "learning_rate": 0.00015028486102230105, |
| "loss": 0.5725, |
| "step": 11100 |
| }, |
| { |
| "epoch": 0.3532513739318276, |
| "grad_norm": 0.9317089319229126, |
| "learning_rate": 0.000150106886565355, |
| "loss": 0.5568, |
| "step": 11120 |
| }, |
| { |
| "epoch": 0.3538867181295467, |
| "grad_norm": 1.025341510772705, |
| "learning_rate": 0.00014992869993566194, |
| "loss": 0.5555, |
| "step": 11140 |
| }, |
| { |
| "epoch": 0.3545220623272658, |
| "grad_norm": 1.0014809370040894, |
| "learning_rate": 0.00014975030188773585, |
| "loss": 0.5922, |
| "step": 11160 |
| }, |
| { |
| "epoch": 0.3551574065249849, |
| "grad_norm": 0.9769735336303711, |
| "learning_rate": 0.00014957169317698593, |
| "loss": 0.583, |
| "step": 11180 |
| }, |
| { |
| "epoch": 0.355792750722704, |
| "grad_norm": 0.8555041551589966, |
| "learning_rate": 0.0001493928745597134, |
| "loss": 0.5609, |
| "step": 11200 |
| }, |
| { |
| "epoch": 0.35642809492042316, |
| "grad_norm": 0.9463367462158203, |
| "learning_rate": 0.0001492138467931084, |
| "loss": 0.5783, |
| "step": 11220 |
| }, |
| { |
| "epoch": 0.35706343911814226, |
| "grad_norm": 0.9429970979690552, |
| "learning_rate": 0.00014903461063524661, |
| "loss": 0.5934, |
| "step": 11240 |
| }, |
| { |
| "epoch": 0.35769878331586136, |
| "grad_norm": 1.4683854579925537, |
| "learning_rate": 0.00014885516684508612, |
| "loss": 0.5939, |
| "step": 11260 |
| }, |
| { |
| "epoch": 0.35833412751358046, |
| "grad_norm": 0.825720489025116, |
| "learning_rate": 0.00014867551618246428, |
| "loss": 0.5685, |
| "step": 11280 |
| }, |
| { |
| "epoch": 0.3589694717112996, |
| "grad_norm": 1.001832127571106, |
| "learning_rate": 0.00014849565940809432, |
| "loss": 0.5837, |
| "step": 11300 |
| }, |
| { |
| "epoch": 0.3596048159090187, |
| "grad_norm": 0.9406988024711609, |
| "learning_rate": 0.00014831559728356234, |
| "loss": 0.5864, |
| "step": 11320 |
| }, |
| { |
| "epoch": 0.3602401601067378, |
| "grad_norm": 0.7483388185501099, |
| "learning_rate": 0.00014813533057132393, |
| "loss": 0.5991, |
| "step": 11340 |
| }, |
| { |
| "epoch": 0.3608755043044569, |
| "grad_norm": 0.8849460482597351, |
| "learning_rate": 0.00014795486003470093, |
| "loss": 0.5821, |
| "step": 11360 |
| }, |
| { |
| "epoch": 0.3615108485021761, |
| "grad_norm": 0.7930045127868652, |
| "learning_rate": 0.00014777418643787836, |
| "loss": 0.5395, |
| "step": 11380 |
| }, |
| { |
| "epoch": 0.3621461926998952, |
| "grad_norm": 0.9285226464271545, |
| "learning_rate": 0.000147593310545901, |
| "loss": 0.5713, |
| "step": 11400 |
| }, |
| { |
| "epoch": 0.3627815368976143, |
| "grad_norm": 1.0233609676361084, |
| "learning_rate": 0.00014741223312467026, |
| "loss": 0.5875, |
| "step": 11420 |
| }, |
| { |
| "epoch": 0.3634168810953334, |
| "grad_norm": 1.033948302268982, |
| "learning_rate": 0.00014723095494094092, |
| "loss": 0.5993, |
| "step": 11440 |
| }, |
| { |
| "epoch": 0.36405222529305253, |
| "grad_norm": 0.9479451179504395, |
| "learning_rate": 0.00014704947676231784, |
| "loss": 0.571, |
| "step": 11460 |
| }, |
| { |
| "epoch": 0.36468756949077163, |
| "grad_norm": 0.7781844735145569, |
| "learning_rate": 0.0001468677993572528, |
| "loss": 0.5503, |
| "step": 11480 |
| }, |
| { |
| "epoch": 0.36532291368849074, |
| "grad_norm": 0.9249241352081299, |
| "learning_rate": 0.00014668592349504101, |
| "loss": 0.574, |
| "step": 11500 |
| }, |
| { |
| "epoch": 0.36595825788620984, |
| "grad_norm": 0.9108446836471558, |
| "learning_rate": 0.00014650384994581824, |
| "loss": 0.557, |
| "step": 11520 |
| }, |
| { |
| "epoch": 0.366593602083929, |
| "grad_norm": 1.0099608898162842, |
| "learning_rate": 0.0001463215794805573, |
| "loss": 0.5605, |
| "step": 11540 |
| }, |
| { |
| "epoch": 0.3672289462816481, |
| "grad_norm": 0.8376953601837158, |
| "learning_rate": 0.00014613911287106467, |
| "loss": 0.538, |
| "step": 11560 |
| }, |
| { |
| "epoch": 0.3678642904793672, |
| "grad_norm": 0.8893873691558838, |
| "learning_rate": 0.00014595645088997757, |
| "loss": 0.5606, |
| "step": 11580 |
| }, |
| { |
| "epoch": 0.3684996346770863, |
| "grad_norm": 1.1310006380081177, |
| "learning_rate": 0.00014577359431076046, |
| "loss": 0.5612, |
| "step": 11600 |
| }, |
| { |
| "epoch": 0.36913497887480545, |
| "grad_norm": 0.8577033281326294, |
| "learning_rate": 0.00014559054390770167, |
| "loss": 0.5688, |
| "step": 11620 |
| }, |
| { |
| "epoch": 0.36977032307252455, |
| "grad_norm": 0.9386855959892273, |
| "learning_rate": 0.00014540730045591044, |
| "loss": 0.5614, |
| "step": 11640 |
| }, |
| { |
| "epoch": 0.37040566727024365, |
| "grad_norm": 0.9492216110229492, |
| "learning_rate": 0.00014522386473131332, |
| "loss": 0.5878, |
| "step": 11660 |
| }, |
| { |
| "epoch": 0.37104101146796276, |
| "grad_norm": 0.853327751159668, |
| "learning_rate": 0.00014504023751065115, |
| "loss": 0.5568, |
| "step": 11680 |
| }, |
| { |
| "epoch": 0.37167635566568186, |
| "grad_norm": 0.7977784872055054, |
| "learning_rate": 0.00014485641957147553, |
| "loss": 0.5428, |
| "step": 11700 |
| }, |
| { |
| "epoch": 0.372311699863401, |
| "grad_norm": 1.1006829738616943, |
| "learning_rate": 0.00014467241169214567, |
| "loss": 0.559, |
| "step": 11720 |
| }, |
| { |
| "epoch": 0.3729470440611201, |
| "grad_norm": 1.08724045753479, |
| "learning_rate": 0.0001444882146518251, |
| "loss": 0.5642, |
| "step": 11740 |
| }, |
| { |
| "epoch": 0.3735823882588392, |
| "grad_norm": 1.0295459032058716, |
| "learning_rate": 0.00014430382923047831, |
| "loss": 0.5969, |
| "step": 11760 |
| }, |
| { |
| "epoch": 0.3742177324565583, |
| "grad_norm": 1.1096023321151733, |
| "learning_rate": 0.00014411925620886742, |
| "loss": 0.5678, |
| "step": 11780 |
| }, |
| { |
| "epoch": 0.3748530766542775, |
| "grad_norm": 0.9315259456634521, |
| "learning_rate": 0.000143934496368549, |
| "loss": 0.5728, |
| "step": 11800 |
| }, |
| { |
| "epoch": 0.3754884208519966, |
| "grad_norm": 0.9581449031829834, |
| "learning_rate": 0.00014374955049187066, |
| "loss": 0.5485, |
| "step": 11820 |
| }, |
| { |
| "epoch": 0.3761237650497157, |
| "grad_norm": 1.472161054611206, |
| "learning_rate": 0.00014356441936196776, |
| "loss": 0.5931, |
| "step": 11840 |
| }, |
| { |
| "epoch": 0.3767591092474348, |
| "grad_norm": 1.0234733819961548, |
| "learning_rate": 0.00014337910376276011, |
| "loss": 0.5635, |
| "step": 11860 |
| }, |
| { |
| "epoch": 0.37739445344515393, |
| "grad_norm": 0.9299212694168091, |
| "learning_rate": 0.00014319360447894862, |
| "loss": 0.5802, |
| "step": 11880 |
| }, |
| { |
| "epoch": 0.37802979764287303, |
| "grad_norm": 0.853388786315918, |
| "learning_rate": 0.00014300792229601198, |
| "loss": 0.5645, |
| "step": 11900 |
| }, |
| { |
| "epoch": 0.37866514184059213, |
| "grad_norm": 0.9909472465515137, |
| "learning_rate": 0.0001428220580002034, |
| "loss": 0.5451, |
| "step": 11920 |
| }, |
| { |
| "epoch": 0.37930048603831124, |
| "grad_norm": 0.8121063113212585, |
| "learning_rate": 0.00014263601237854716, |
| "loss": 0.5514, |
| "step": 11940 |
| }, |
| { |
| "epoch": 0.3799358302360304, |
| "grad_norm": 0.9053930044174194, |
| "learning_rate": 0.00014244978621883543, |
| "loss": 0.5371, |
| "step": 11960 |
| }, |
| { |
| "epoch": 0.3805711744337495, |
| "grad_norm": 1.0551111698150635, |
| "learning_rate": 0.00014226338030962475, |
| "loss": 0.5862, |
| "step": 11980 |
| }, |
| { |
| "epoch": 0.3812065186314686, |
| "grad_norm": 0.8897386193275452, |
| "learning_rate": 0.0001420767954402329, |
| "loss": 0.5439, |
| "step": 12000 |
| }, |
| { |
| "epoch": 0.3812065186314686, |
| "eval_loss": 0.5259391665458679, |
| "eval_runtime": 45.0289, |
| "eval_samples_per_second": 60.028, |
| "eval_steps_per_second": 30.025, |
| "step": 12000 |
| }, |
| { |
| "epoch": 0.3818418628291877, |
| "grad_norm": 0.8436812162399292, |
| "learning_rate": 0.00014189003240073535, |
| "loss": 0.5684, |
| "step": 12020 |
| }, |
| { |
| "epoch": 0.38247720702690685, |
| "grad_norm": 1.2769359350204468, |
| "learning_rate": 0.0001417030919819621, |
| "loss": 0.5483, |
| "step": 12040 |
| }, |
| { |
| "epoch": 0.38311255122462595, |
| "grad_norm": 0.8915470838546753, |
| "learning_rate": 0.0001415159749754942, |
| "loss": 0.5674, |
| "step": 12060 |
| }, |
| { |
| "epoch": 0.38374789542234505, |
| "grad_norm": 1.1026362180709839, |
| "learning_rate": 0.00014132868217366044, |
| "loss": 0.5868, |
| "step": 12080 |
| }, |
| { |
| "epoch": 0.38438323962006415, |
| "grad_norm": 0.92413729429245, |
| "learning_rate": 0.00014114121436953402, |
| "loss": 0.5602, |
| "step": 12100 |
| }, |
| { |
| "epoch": 0.3850185838177833, |
| "grad_norm": 0.8880215287208557, |
| "learning_rate": 0.0001409535723569291, |
| "loss": 0.563, |
| "step": 12120 |
| }, |
| { |
| "epoch": 0.3856539280155024, |
| "grad_norm": 0.7865646481513977, |
| "learning_rate": 0.00014076575693039767, |
| "loss": 0.5731, |
| "step": 12140 |
| }, |
| { |
| "epoch": 0.3862892722132215, |
| "grad_norm": 0.8817760348320007, |
| "learning_rate": 0.00014057776888522583, |
| "loss": 0.5205, |
| "step": 12160 |
| }, |
| { |
| "epoch": 0.3869246164109406, |
| "grad_norm": 0.7473212480545044, |
| "learning_rate": 0.0001403896090174307, |
| "loss": 0.5494, |
| "step": 12180 |
| }, |
| { |
| "epoch": 0.3875599606086597, |
| "grad_norm": 0.9429736137390137, |
| "learning_rate": 0.0001402012781237571, |
| "loss": 0.551, |
| "step": 12200 |
| }, |
| { |
| "epoch": 0.38819530480637887, |
| "grad_norm": 0.9144492149353027, |
| "learning_rate": 0.00014001277700167382, |
| "loss": 0.529, |
| "step": 12220 |
| }, |
| { |
| "epoch": 0.388830649004098, |
| "grad_norm": 0.8465405702590942, |
| "learning_rate": 0.00013982410644937057, |
| "loss": 0.566, |
| "step": 12240 |
| }, |
| { |
| "epoch": 0.3894659932018171, |
| "grad_norm": 0.8520842790603638, |
| "learning_rate": 0.00013963526726575446, |
| "loss": 0.61, |
| "step": 12260 |
| }, |
| { |
| "epoch": 0.3901013373995362, |
| "grad_norm": 0.8384197354316711, |
| "learning_rate": 0.00013944626025044673, |
| "loss": 0.563, |
| "step": 12280 |
| }, |
| { |
| "epoch": 0.39073668159725533, |
| "grad_norm": 0.9083155989646912, |
| "learning_rate": 0.00013925708620377927, |
| "loss": 0.5433, |
| "step": 12300 |
| }, |
| { |
| "epoch": 0.39137202579497443, |
| "grad_norm": 1.0582692623138428, |
| "learning_rate": 0.00013906774592679116, |
| "loss": 0.5368, |
| "step": 12320 |
| }, |
| { |
| "epoch": 0.39200736999269353, |
| "grad_norm": 0.8538171648979187, |
| "learning_rate": 0.00013887824022122537, |
| "loss": 0.5217, |
| "step": 12340 |
| }, |
| { |
| "epoch": 0.39264271419041263, |
| "grad_norm": 0.8264597058296204, |
| "learning_rate": 0.00013868856988952556, |
| "loss": 0.5564, |
| "step": 12360 |
| }, |
| { |
| "epoch": 0.3932780583881318, |
| "grad_norm": 0.8192921280860901, |
| "learning_rate": 0.00013849873573483222, |
| "loss": 0.6058, |
| "step": 12380 |
| }, |
| { |
| "epoch": 0.3939134025858509, |
| "grad_norm": 0.8523415923118591, |
| "learning_rate": 0.00013830873856097964, |
| "loss": 0.5565, |
| "step": 12400 |
| }, |
| { |
| "epoch": 0.39454874678357, |
| "grad_norm": 1.0821831226348877, |
| "learning_rate": 0.00013811857917249253, |
| "loss": 0.5617, |
| "step": 12420 |
| }, |
| { |
| "epoch": 0.3951840909812891, |
| "grad_norm": 0.8053098917007446, |
| "learning_rate": 0.00013792825837458225, |
| "loss": 0.579, |
| "step": 12440 |
| }, |
| { |
| "epoch": 0.39581943517900825, |
| "grad_norm": 0.9511120319366455, |
| "learning_rate": 0.00013773777697314378, |
| "loss": 0.5417, |
| "step": 12460 |
| }, |
| { |
| "epoch": 0.39645477937672735, |
| "grad_norm": 1.0273131132125854, |
| "learning_rate": 0.00013754713577475213, |
| "loss": 0.582, |
| "step": 12480 |
| }, |
| { |
| "epoch": 0.39709012357444645, |
| "grad_norm": 1.0347099304199219, |
| "learning_rate": 0.00013735633558665893, |
| "loss": 0.5679, |
| "step": 12500 |
| }, |
| { |
| "epoch": 0.39772546777216555, |
| "grad_norm": 1.0762611627578735, |
| "learning_rate": 0.00013716537721678907, |
| "loss": 0.5483, |
| "step": 12520 |
| }, |
| { |
| "epoch": 0.3983608119698847, |
| "grad_norm": 1.4243688583374023, |
| "learning_rate": 0.00013697426147373721, |
| "loss": 0.5558, |
| "step": 12540 |
| }, |
| { |
| "epoch": 0.3989961561676038, |
| "grad_norm": 0.7539466023445129, |
| "learning_rate": 0.00013678298916676445, |
| "loss": 0.5404, |
| "step": 12560 |
| }, |
| { |
| "epoch": 0.3996315003653229, |
| "grad_norm": 0.7736854553222656, |
| "learning_rate": 0.00013659156110579476, |
| "loss": 0.5578, |
| "step": 12580 |
| }, |
| { |
| "epoch": 0.400266844563042, |
| "grad_norm": 0.9489171504974365, |
| "learning_rate": 0.0001363999781014117, |
| "loss": 0.5668, |
| "step": 12600 |
| }, |
| { |
| "epoch": 0.40090218876076117, |
| "grad_norm": 0.9692643880844116, |
| "learning_rate": 0.00013621783146979094, |
| "loss": 0.5663, |
| "step": 12620 |
| }, |
| { |
| "epoch": 0.40153753295848027, |
| "grad_norm": 1.0705336332321167, |
| "learning_rate": 0.00013602594865967435, |
| "loss": 0.5293, |
| "step": 12640 |
| }, |
| { |
| "epoch": 0.40217287715619937, |
| "grad_norm": 1.0149205923080444, |
| "learning_rate": 0.00013583391330117533, |
| "loss": 0.5348, |
| "step": 12660 |
| }, |
| { |
| "epoch": 0.40280822135391847, |
| "grad_norm": 0.9088581204414368, |
| "learning_rate": 0.00013564172620744906, |
| "loss": 0.5677, |
| "step": 12680 |
| }, |
| { |
| "epoch": 0.4034435655516376, |
| "grad_norm": 1.1513986587524414, |
| "learning_rate": 0.00013544938819229306, |
| "loss": 0.569, |
| "step": 12700 |
| }, |
| { |
| "epoch": 0.40407890974935673, |
| "grad_norm": 0.8725998401641846, |
| "learning_rate": 0.00013525690007014406, |
| "loss": 0.5692, |
| "step": 12720 |
| }, |
| { |
| "epoch": 0.40471425394707583, |
| "grad_norm": 1.0663046836853027, |
| "learning_rate": 0.00013506426265607425, |
| "loss": 0.567, |
| "step": 12740 |
| }, |
| { |
| "epoch": 0.40534959814479493, |
| "grad_norm": 0.9139559864997864, |
| "learning_rate": 0.00013487147676578812, |
| "loss": 0.5465, |
| "step": 12760 |
| }, |
| { |
| "epoch": 0.40598494234251403, |
| "grad_norm": 1.3140777349472046, |
| "learning_rate": 0.00013467854321561878, |
| "loss": 0.5407, |
| "step": 12780 |
| }, |
| { |
| "epoch": 0.4066202865402332, |
| "grad_norm": 0.8671903610229492, |
| "learning_rate": 0.00013448546282252458, |
| "loss": 0.5303, |
| "step": 12800 |
| }, |
| { |
| "epoch": 0.4072556307379523, |
| "grad_norm": 0.692545473575592, |
| "learning_rate": 0.00013429223640408578, |
| "loss": 0.5333, |
| "step": 12820 |
| }, |
| { |
| "epoch": 0.4078909749356714, |
| "grad_norm": 1.1087654829025269, |
| "learning_rate": 0.00013409886477850087, |
| "loss": 0.5493, |
| "step": 12840 |
| }, |
| { |
| "epoch": 0.4085263191333905, |
| "grad_norm": 0.9659181833267212, |
| "learning_rate": 0.00013390534876458319, |
| "loss": 0.5902, |
| "step": 12860 |
| }, |
| { |
| "epoch": 0.40916166333110965, |
| "grad_norm": 0.7794270515441895, |
| "learning_rate": 0.00013371168918175754, |
| "loss": 0.5647, |
| "step": 12880 |
| }, |
| { |
| "epoch": 0.40979700752882875, |
| "grad_norm": 0.910505473613739, |
| "learning_rate": 0.00013351788685005662, |
| "loss": 0.5752, |
| "step": 12900 |
| }, |
| { |
| "epoch": 0.41043235172654785, |
| "grad_norm": 0.9549837112426758, |
| "learning_rate": 0.00013332394259011758, |
| "loss": 0.5424, |
| "step": 12920 |
| }, |
| { |
| "epoch": 0.41106769592426695, |
| "grad_norm": 1.2679826021194458, |
| "learning_rate": 0.00013312985722317862, |
| "loss": 0.5285, |
| "step": 12940 |
| }, |
| { |
| "epoch": 0.4117030401219861, |
| "grad_norm": 0.8822807669639587, |
| "learning_rate": 0.0001329356315710753, |
| "loss": 0.5662, |
| "step": 12960 |
| }, |
| { |
| "epoch": 0.4123383843197052, |
| "grad_norm": 0.8247064352035522, |
| "learning_rate": 0.0001327412664562373, |
| "loss": 0.5338, |
| "step": 12980 |
| }, |
| { |
| "epoch": 0.4129737285174243, |
| "grad_norm": 0.8655696511268616, |
| "learning_rate": 0.0001325467627016849, |
| "loss": 0.5563, |
| "step": 13000 |
| }, |
| { |
| "epoch": 0.4129737285174243, |
| "eval_loss": 0.5103311538696289, |
| "eval_runtime": 44.4811, |
| "eval_samples_per_second": 60.767, |
| "eval_steps_per_second": 30.395, |
| "step": 13000 |
| }, |
| { |
| "epoch": 0.4136090727151434, |
| "grad_norm": 1.1745620965957642, |
| "learning_rate": 0.00013235212113102532, |
| "loss": 0.5432, |
| "step": 13020 |
| }, |
| { |
| "epoch": 0.41424441691286257, |
| "grad_norm": 1.375957727432251, |
| "learning_rate": 0.0001321573425684494, |
| "loss": 0.5518, |
| "step": 13040 |
| }, |
| { |
| "epoch": 0.41487976111058167, |
| "grad_norm": 1.2425376176834106, |
| "learning_rate": 0.00013196242783872805, |
| "loss": 0.5667, |
| "step": 13060 |
| }, |
| { |
| "epoch": 0.41551510530830077, |
| "grad_norm": 0.9375765919685364, |
| "learning_rate": 0.00013176737776720876, |
| "loss": 0.5629, |
| "step": 13080 |
| }, |
| { |
| "epoch": 0.41615044950601987, |
| "grad_norm": 0.9392895698547363, |
| "learning_rate": 0.00013157219317981217, |
| "loss": 0.5577, |
| "step": 13100 |
| }, |
| { |
| "epoch": 0.416785793703739, |
| "grad_norm": 0.9028527140617371, |
| "learning_rate": 0.00013137687490302844, |
| "loss": 0.5358, |
| "step": 13120 |
| }, |
| { |
| "epoch": 0.41742113790145813, |
| "grad_norm": 0.9373983144760132, |
| "learning_rate": 0.00013118142376391381, |
| "loss": 0.5517, |
| "step": 13140 |
| }, |
| { |
| "epoch": 0.41805648209917723, |
| "grad_norm": 1.3339825868606567, |
| "learning_rate": 0.00013098584059008725, |
| "loss": 0.5512, |
| "step": 13160 |
| }, |
| { |
| "epoch": 0.41869182629689633, |
| "grad_norm": 0.7137243747711182, |
| "learning_rate": 0.00013079012620972663, |
| "loss": 0.5464, |
| "step": 13180 |
| }, |
| { |
| "epoch": 0.41932717049461543, |
| "grad_norm": 1.1450612545013428, |
| "learning_rate": 0.00013059428145156555, |
| "loss": 0.564, |
| "step": 13200 |
| }, |
| { |
| "epoch": 0.4199625146923346, |
| "grad_norm": 1.2148438692092896, |
| "learning_rate": 0.00013039830714488965, |
| "loss": 0.5555, |
| "step": 13220 |
| }, |
| { |
| "epoch": 0.4205978588900537, |
| "grad_norm": 1.277346134185791, |
| "learning_rate": 0.00013020220411953304, |
| "loss": 0.5898, |
| "step": 13240 |
| }, |
| { |
| "epoch": 0.4212332030877728, |
| "grad_norm": 1.0933984518051147, |
| "learning_rate": 0.00013000597320587492, |
| "loss": 0.553, |
| "step": 13260 |
| }, |
| { |
| "epoch": 0.4218685472854919, |
| "grad_norm": 0.7297493815422058, |
| "learning_rate": 0.00012980961523483616, |
| "loss": 0.5626, |
| "step": 13280 |
| }, |
| { |
| "epoch": 0.42250389148321105, |
| "grad_norm": 0.8859849572181702, |
| "learning_rate": 0.00012961313103787548, |
| "loss": 0.5455, |
| "step": 13300 |
| }, |
| { |
| "epoch": 0.42313923568093015, |
| "grad_norm": 0.9647216200828552, |
| "learning_rate": 0.00012941652144698608, |
| "loss": 0.5157, |
| "step": 13320 |
| }, |
| { |
| "epoch": 0.42377457987864925, |
| "grad_norm": 0.9097155332565308, |
| "learning_rate": 0.00012921978729469222, |
| "loss": 0.542, |
| "step": 13340 |
| }, |
| { |
| "epoch": 0.42440992407636835, |
| "grad_norm": 1.0074721574783325, |
| "learning_rate": 0.0001290229294140456, |
| "loss": 0.5319, |
| "step": 13360 |
| }, |
| { |
| "epoch": 0.4250452682740875, |
| "grad_norm": 0.7759230732917786, |
| "learning_rate": 0.0001288259486386218, |
| "loss": 0.4939, |
| "step": 13380 |
| }, |
| { |
| "epoch": 0.4256806124718066, |
| "grad_norm": 0.8912795782089233, |
| "learning_rate": 0.00012862884580251675, |
| "loss": 0.5276, |
| "step": 13400 |
| }, |
| { |
| "epoch": 0.4263159566695257, |
| "grad_norm": 1.090395450592041, |
| "learning_rate": 0.00012843162174034332, |
| "loss": 0.5227, |
| "step": 13420 |
| }, |
| { |
| "epoch": 0.4269513008672448, |
| "grad_norm": 0.8524248003959656, |
| "learning_rate": 0.00012823427728722762, |
| "loss": 0.5438, |
| "step": 13440 |
| }, |
| { |
| "epoch": 0.42758664506496397, |
| "grad_norm": 1.209073543548584, |
| "learning_rate": 0.0001280368132788056, |
| "loss": 0.5495, |
| "step": 13460 |
| }, |
| { |
| "epoch": 0.42822198926268307, |
| "grad_norm": 0.9301733374595642, |
| "learning_rate": 0.00012783923055121945, |
| "loss": 0.5411, |
| "step": 13480 |
| }, |
| { |
| "epoch": 0.42885733346040217, |
| "grad_norm": 0.916028618812561, |
| "learning_rate": 0.000127641529941114, |
| "loss": 0.5674, |
| "step": 13500 |
| }, |
| { |
| "epoch": 0.42949267765812127, |
| "grad_norm": 0.9181066751480103, |
| "learning_rate": 0.00012744371228563334, |
| "loss": 0.5522, |
| "step": 13520 |
| }, |
| { |
| "epoch": 0.4301280218558404, |
| "grad_norm": 1.2208302021026611, |
| "learning_rate": 0.0001272457784224171, |
| "loss": 0.5428, |
| "step": 13540 |
| }, |
| { |
| "epoch": 0.4307633660535595, |
| "grad_norm": 0.8382121920585632, |
| "learning_rate": 0.00012704772918959706, |
| "loss": 0.5347, |
| "step": 13560 |
| }, |
| { |
| "epoch": 0.4313987102512786, |
| "grad_norm": 0.7942314147949219, |
| "learning_rate": 0.0001268495654257934, |
| "loss": 0.5455, |
| "step": 13580 |
| }, |
| { |
| "epoch": 0.43203405444899773, |
| "grad_norm": 1.0586442947387695, |
| "learning_rate": 0.00012665128797011138, |
| "loss": 0.5588, |
| "step": 13600 |
| }, |
| { |
| "epoch": 0.4326693986467169, |
| "grad_norm": 0.9026583433151245, |
| "learning_rate": 0.00012645289766213764, |
| "loss": 0.5448, |
| "step": 13620 |
| }, |
| { |
| "epoch": 0.433304742844436, |
| "grad_norm": 1.107459545135498, |
| "learning_rate": 0.0001262643231052632, |
| "loss": 0.5226, |
| "step": 13640 |
| }, |
| { |
| "epoch": 0.4339400870421551, |
| "grad_norm": 0.7181698679924011, |
| "learning_rate": 0.00012606571515198816, |
| "loss": 0.5587, |
| "step": 13660 |
| }, |
| { |
| "epoch": 0.4345754312398742, |
| "grad_norm": 0.850642740726471, |
| "learning_rate": 0.0001258669968259726, |
| "loss": 0.5514, |
| "step": 13680 |
| }, |
| { |
| "epoch": 0.4352107754375933, |
| "grad_norm": 0.9803110957145691, |
| "learning_rate": 0.00012567811294990802, |
| "loss": 0.5612, |
| "step": 13700 |
| }, |
| { |
| "epoch": 0.43584611963531245, |
| "grad_norm": 0.8320556282997131, |
| "learning_rate": 0.00012547918181770158, |
| "loss": 0.5464, |
| "step": 13720 |
| }, |
| { |
| "epoch": 0.43648146383303155, |
| "grad_norm": 0.9645776152610779, |
| "learning_rate": 0.0001252801427963731, |
| "loss": 0.5394, |
| "step": 13740 |
| }, |
| { |
| "epoch": 0.43711680803075065, |
| "grad_norm": 0.981066107749939, |
| "learning_rate": 0.00012508099672873401, |
| "loss": 0.5518, |
| "step": 13760 |
| }, |
| { |
| "epoch": 0.43775215222846975, |
| "grad_norm": 0.950231671333313, |
| "learning_rate": 0.00012488174445804905, |
| "loss": 0.5628, |
| "step": 13780 |
| }, |
| { |
| "epoch": 0.4383874964261889, |
| "grad_norm": 0.7942489981651306, |
| "learning_rate": 0.00012468238682803256, |
| "loss": 0.5682, |
| "step": 13800 |
| }, |
| { |
| "epoch": 0.439022840623908, |
| "grad_norm": 0.9598709940910339, |
| "learning_rate": 0.0001244829246828451, |
| "loss": 0.5398, |
| "step": 13820 |
| }, |
| { |
| "epoch": 0.4396581848216271, |
| "grad_norm": 0.9328323602676392, |
| "learning_rate": 0.0001242833588670898, |
| "loss": 0.5465, |
| "step": 13840 |
| }, |
| { |
| "epoch": 0.4402935290193462, |
| "grad_norm": 0.9036662578582764, |
| "learning_rate": 0.00012408369022580865, |
| "loss": 0.5307, |
| "step": 13860 |
| }, |
| { |
| "epoch": 0.44092887321706536, |
| "grad_norm": 1.1593483686447144, |
| "learning_rate": 0.0001238839196044792, |
| "loss": 0.5838, |
| "step": 13880 |
| }, |
| { |
| "epoch": 0.44156421741478447, |
| "grad_norm": 0.9283963441848755, |
| "learning_rate": 0.0001236840478490107, |
| "loss": 0.5112, |
| "step": 13900 |
| }, |
| { |
| "epoch": 0.44219956161250357, |
| "grad_norm": 1.1374804973602295, |
| "learning_rate": 0.00012348407580574068, |
| "loss": 0.5616, |
| "step": 13920 |
| }, |
| { |
| "epoch": 0.44283490581022267, |
| "grad_norm": 0.8757379055023193, |
| "learning_rate": 0.00012328400432143143, |
| "loss": 0.5409, |
| "step": 13940 |
| }, |
| { |
| "epoch": 0.4434702500079418, |
| "grad_norm": 0.9971847534179688, |
| "learning_rate": 0.00012308383424326617, |
| "loss": 0.5573, |
| "step": 13960 |
| }, |
| { |
| "epoch": 0.4441055942056609, |
| "grad_norm": 0.8985651135444641, |
| "learning_rate": 0.00012288356641884567, |
| "loss": 0.5602, |
| "step": 13980 |
| }, |
| { |
| "epoch": 0.44474093840338, |
| "grad_norm": 0.8877219557762146, |
| "learning_rate": 0.0001226832016961846, |
| "loss": 0.5418, |
| "step": 14000 |
| }, |
| { |
| "epoch": 0.44474093840338, |
| "eval_loss": 0.49767744541168213, |
| "eval_runtime": 45.8378, |
| "eval_samples_per_second": 58.969, |
| "eval_steps_per_second": 29.495, |
| "step": 14000 |
| }, |
| { |
| "epoch": 0.4453762826010991, |
| "grad_norm": 0.9760685563087463, |
| "learning_rate": 0.00012248274092370795, |
| "loss": 0.5386, |
| "step": 14020 |
| }, |
| { |
| "epoch": 0.4460116267988183, |
| "grad_norm": 0.9159601330757141, |
| "learning_rate": 0.00012228218495024734, |
| "loss": 0.5658, |
| "step": 14040 |
| }, |
| { |
| "epoch": 0.4466469709965374, |
| "grad_norm": 0.9726976752281189, |
| "learning_rate": 0.00012208153462503764, |
| "loss": 0.5619, |
| "step": 14060 |
| }, |
| { |
| "epoch": 0.4472823151942565, |
| "grad_norm": 0.8647946715354919, |
| "learning_rate": 0.00012188079079771311, |
| "loss": 0.5312, |
| "step": 14080 |
| }, |
| { |
| "epoch": 0.4479176593919756, |
| "grad_norm": 0.8291323781013489, |
| "learning_rate": 0.00012167995431830404, |
| "loss": 0.5555, |
| "step": 14100 |
| }, |
| { |
| "epoch": 0.44855300358969474, |
| "grad_norm": 1.1393893957138062, |
| "learning_rate": 0.00012147902603723302, |
| "loss": 0.5368, |
| "step": 14120 |
| }, |
| { |
| "epoch": 0.44918834778741384, |
| "grad_norm": 0.9214714169502258, |
| "learning_rate": 0.00012127800680531129, |
| "loss": 0.5312, |
| "step": 14140 |
| }, |
| { |
| "epoch": 0.44982369198513295, |
| "grad_norm": 0.7314972877502441, |
| "learning_rate": 0.00012107689747373533, |
| "loss": 0.5306, |
| "step": 14160 |
| }, |
| { |
| "epoch": 0.45045903618285205, |
| "grad_norm": 0.9739118218421936, |
| "learning_rate": 0.00012087569889408308, |
| "loss": 0.5474, |
| "step": 14180 |
| }, |
| { |
| "epoch": 0.45109438038057115, |
| "grad_norm": 1.1331558227539062, |
| "learning_rate": 0.00012067441191831035, |
| "loss": 0.5251, |
| "step": 14200 |
| }, |
| { |
| "epoch": 0.4517297245782903, |
| "grad_norm": 0.9672099947929382, |
| "learning_rate": 0.00012047303739874733, |
| "loss": 0.5638, |
| "step": 14220 |
| }, |
| { |
| "epoch": 0.4523650687760094, |
| "grad_norm": 0.9430161118507385, |
| "learning_rate": 0.00012027157618809488, |
| "loss": 0.5473, |
| "step": 14240 |
| }, |
| { |
| "epoch": 0.4530004129737285, |
| "grad_norm": 0.9385126233100891, |
| "learning_rate": 0.00012007002913942092, |
| "loss": 0.5305, |
| "step": 14260 |
| }, |
| { |
| "epoch": 0.4536357571714476, |
| "grad_norm": 1.2930362224578857, |
| "learning_rate": 0.00011986839710615689, |
| "loss": 0.5264, |
| "step": 14280 |
| }, |
| { |
| "epoch": 0.45427110136916676, |
| "grad_norm": 1.098981499671936, |
| "learning_rate": 0.00011966668094209401, |
| "loss": 0.5945, |
| "step": 14300 |
| }, |
| { |
| "epoch": 0.45490644556688586, |
| "grad_norm": 1.016724944114685, |
| "learning_rate": 0.00011946488150137987, |
| "loss": 0.5423, |
| "step": 14320 |
| }, |
| { |
| "epoch": 0.45554178976460497, |
| "grad_norm": 1.3441358804702759, |
| "learning_rate": 0.00011926299963851455, |
| "loss": 0.5311, |
| "step": 14340 |
| }, |
| { |
| "epoch": 0.45617713396232407, |
| "grad_norm": 0.8672164678573608, |
| "learning_rate": 0.00011906103620834721, |
| "loss": 0.5377, |
| "step": 14360 |
| }, |
| { |
| "epoch": 0.4568124781600432, |
| "grad_norm": 0.8844342231750488, |
| "learning_rate": 0.00011885899206607243, |
| "loss": 0.5539, |
| "step": 14380 |
| }, |
| { |
| "epoch": 0.4574478223577623, |
| "grad_norm": 1.0755807161331177, |
| "learning_rate": 0.00011865686806722647, |
| "loss": 0.5489, |
| "step": 14400 |
| }, |
| { |
| "epoch": 0.4580831665554814, |
| "grad_norm": 0.8909132480621338, |
| "learning_rate": 0.00011845466506768379, |
| "loss": 0.5492, |
| "step": 14420 |
| }, |
| { |
| "epoch": 0.4587185107532005, |
| "grad_norm": 0.7222205996513367, |
| "learning_rate": 0.00011826249982356501, |
| "loss": 0.5452, |
| "step": 14440 |
| }, |
| { |
| "epoch": 0.4593538549509197, |
| "grad_norm": 0.8589527606964111, |
| "learning_rate": 0.00011806014523563623, |
| "loss": 0.5553, |
| "step": 14460 |
| }, |
| { |
| "epoch": 0.4599891991486388, |
| "grad_norm": 0.8546582460403442, |
| "learning_rate": 0.00011785771417377567, |
| "loss": 0.518, |
| "step": 14480 |
| }, |
| { |
| "epoch": 0.4606245433463579, |
| "grad_norm": 0.7938315272331238, |
| "learning_rate": 0.00011765520749515795, |
| "loss": 0.5732, |
| "step": 14500 |
| }, |
| { |
| "epoch": 0.461259887544077, |
| "grad_norm": 1.030897617340088, |
| "learning_rate": 0.000117452626057278, |
| "loss": 0.5293, |
| "step": 14520 |
| }, |
| { |
| "epoch": 0.46189523174179614, |
| "grad_norm": 0.9275230765342712, |
| "learning_rate": 0.00011724997071794722, |
| "loss": 0.5453, |
| "step": 14540 |
| }, |
| { |
| "epoch": 0.46253057593951524, |
| "grad_norm": 0.8049765825271606, |
| "learning_rate": 0.00011704724233528997, |
| "loss": 0.5237, |
| "step": 14560 |
| }, |
| { |
| "epoch": 0.46316592013723434, |
| "grad_norm": 0.9411914348602295, |
| "learning_rate": 0.00011684444176773994, |
| "loss": 0.5529, |
| "step": 14580 |
| }, |
| { |
| "epoch": 0.46380126433495344, |
| "grad_norm": 1.0553874969482422, |
| "learning_rate": 0.0001166415698740364, |
| "loss": 0.5107, |
| "step": 14600 |
| }, |
| { |
| "epoch": 0.4644366085326726, |
| "grad_norm": 1.1203105449676514, |
| "learning_rate": 0.00011643862751322072, |
| "loss": 0.5503, |
| "step": 14620 |
| }, |
| { |
| "epoch": 0.4650719527303917, |
| "grad_norm": 0.9356998801231384, |
| "learning_rate": 0.00011623561554463263, |
| "loss": 0.5388, |
| "step": 14640 |
| }, |
| { |
| "epoch": 0.4657072969281108, |
| "grad_norm": 1.0603325366973877, |
| "learning_rate": 0.00011603253482790657, |
| "loss": 0.5379, |
| "step": 14660 |
| }, |
| { |
| "epoch": 0.4663426411258299, |
| "grad_norm": 0.7650070786476135, |
| "learning_rate": 0.00011582938622296818, |
| "loss": 0.5175, |
| "step": 14680 |
| }, |
| { |
| "epoch": 0.466977985323549, |
| "grad_norm": 1.1926647424697876, |
| "learning_rate": 0.00011562617059003044, |
| "loss": 0.5558, |
| "step": 14700 |
| }, |
| { |
| "epoch": 0.46761332952126816, |
| "grad_norm": 0.9466400742530823, |
| "learning_rate": 0.00011542288878959025, |
| "loss": 0.5288, |
| "step": 14720 |
| }, |
| { |
| "epoch": 0.46824867371898726, |
| "grad_norm": 1.036163091659546, |
| "learning_rate": 0.0001152195416824247, |
| "loss": 0.5322, |
| "step": 14740 |
| }, |
| { |
| "epoch": 0.46888401791670636, |
| "grad_norm": 0.8458572626113892, |
| "learning_rate": 0.00011501613012958729, |
| "loss": 0.5358, |
| "step": 14760 |
| }, |
| { |
| "epoch": 0.46951936211442546, |
| "grad_norm": 0.789557695388794, |
| "learning_rate": 0.00011481265499240455, |
| "loss": 0.5067, |
| "step": 14780 |
| }, |
| { |
| "epoch": 0.4701547063121446, |
| "grad_norm": 0.845371425151825, |
| "learning_rate": 0.00011460911713247222, |
| "loss": 0.5433, |
| "step": 14800 |
| }, |
| { |
| "epoch": 0.4707900505098637, |
| "grad_norm": 0.8561549782752991, |
| "learning_rate": 0.00011440551741165156, |
| "loss": 0.5362, |
| "step": 14820 |
| }, |
| { |
| "epoch": 0.4714253947075828, |
| "grad_norm": 0.921575665473938, |
| "learning_rate": 0.00011420185669206582, |
| "loss": 0.5093, |
| "step": 14840 |
| }, |
| { |
| "epoch": 0.4720607389053019, |
| "grad_norm": 0.9392147660255432, |
| "learning_rate": 0.0001139981358360966, |
| "loss": 0.5419, |
| "step": 14860 |
| }, |
| { |
| "epoch": 0.4726960831030211, |
| "grad_norm": 0.859464168548584, |
| "learning_rate": 0.00011379435570638002, |
| "loss": 0.5329, |
| "step": 14880 |
| }, |
| { |
| "epoch": 0.4733314273007402, |
| "grad_norm": 0.9370890259742737, |
| "learning_rate": 0.00011359051716580331, |
| "loss": 0.516, |
| "step": 14900 |
| }, |
| { |
| "epoch": 0.4739667714984593, |
| "grad_norm": 0.8993077278137207, |
| "learning_rate": 0.00011338662107750098, |
| "loss": 0.4785, |
| "step": 14920 |
| }, |
| { |
| "epoch": 0.4746021156961784, |
| "grad_norm": 0.7652683854103088, |
| "learning_rate": 0.00011318266830485119, |
| "loss": 0.5348, |
| "step": 14940 |
| }, |
| { |
| "epoch": 0.47523745989389754, |
| "grad_norm": 1.0513384342193604, |
| "learning_rate": 0.00011297865971147217, |
| "loss": 0.5181, |
| "step": 14960 |
| }, |
| { |
| "epoch": 0.47587280409161664, |
| "grad_norm": 0.8159809112548828, |
| "learning_rate": 0.00011277459616121851, |
| "loss": 0.5368, |
| "step": 14980 |
| }, |
| { |
| "epoch": 0.47650814828933574, |
| "grad_norm": 1.0844529867172241, |
| "learning_rate": 0.00011257047851817748, |
| "loss": 0.5497, |
| "step": 15000 |
| }, |
| { |
| "epoch": 0.47650814828933574, |
| "eval_loss": 0.4893677234649658, |
| "eval_runtime": 45.7511, |
| "eval_samples_per_second": 59.081, |
| "eval_steps_per_second": 29.551, |
| "step": 15000 |
| }, |
| { |
| "epoch": 0.47714349248705484, |
| "grad_norm": 0.7700105309486389, |
| "learning_rate": 0.0001123663076466655, |
| "loss": 0.5354, |
| "step": 15020 |
| }, |
| { |
| "epoch": 0.477778836684774, |
| "grad_norm": 0.872631847858429, |
| "learning_rate": 0.0001121620844112242, |
| "loss": 0.5243, |
| "step": 15040 |
| }, |
| { |
| "epoch": 0.4784141808824931, |
| "grad_norm": 1.1037932634353638, |
| "learning_rate": 0.0001119578096766171, |
| "loss": 0.5412, |
| "step": 15060 |
| }, |
| { |
| "epoch": 0.4790495250802122, |
| "grad_norm": 0.9620169997215271, |
| "learning_rate": 0.00011175348430782579, |
| "loss": 0.5137, |
| "step": 15080 |
| }, |
| { |
| "epoch": 0.4796848692779313, |
| "grad_norm": 0.7465859055519104, |
| "learning_rate": 0.0001115491091700461, |
| "loss": 0.5213, |
| "step": 15100 |
| }, |
| { |
| "epoch": 0.48032021347565046, |
| "grad_norm": 0.7287941575050354, |
| "learning_rate": 0.00011134468512868479, |
| "loss": 0.5184, |
| "step": 15120 |
| }, |
| { |
| "epoch": 0.48095555767336956, |
| "grad_norm": 0.9596436023712158, |
| "learning_rate": 0.00011114021304935558, |
| "loss": 0.5471, |
| "step": 15140 |
| }, |
| { |
| "epoch": 0.48159090187108866, |
| "grad_norm": 0.869172215461731, |
| "learning_rate": 0.00011093569379787563, |
| "loss": 0.5074, |
| "step": 15160 |
| }, |
| { |
| "epoch": 0.48222624606880776, |
| "grad_norm": 1.0704097747802734, |
| "learning_rate": 0.00011073112824026191, |
| "loss": 0.544, |
| "step": 15180 |
| }, |
| { |
| "epoch": 0.48286159026652686, |
| "grad_norm": 0.896312415599823, |
| "learning_rate": 0.00011052651724272736, |
| "loss": 0.5261, |
| "step": 15200 |
| }, |
| { |
| "epoch": 0.483496934464246, |
| "grad_norm": 1.010606288909912, |
| "learning_rate": 0.00011032186167167741, |
| "loss": 0.5112, |
| "step": 15220 |
| }, |
| { |
| "epoch": 0.4841322786619651, |
| "grad_norm": 0.980171263217926, |
| "learning_rate": 0.00011011716239370625, |
| "loss": 0.5414, |
| "step": 15240 |
| }, |
| { |
| "epoch": 0.4847676228596842, |
| "grad_norm": 0.7417489290237427, |
| "learning_rate": 0.00010991242027559301, |
| "loss": 0.5019, |
| "step": 15260 |
| }, |
| { |
| "epoch": 0.4854029670574033, |
| "grad_norm": 0.9232955574989319, |
| "learning_rate": 0.0001097076361842984, |
| "loss": 0.5293, |
| "step": 15280 |
| }, |
| { |
| "epoch": 0.4860383112551225, |
| "grad_norm": 0.8391673564910889, |
| "learning_rate": 0.00010950281098696072, |
| "loss": 0.5397, |
| "step": 15300 |
| }, |
| { |
| "epoch": 0.4866736554528416, |
| "grad_norm": 1.0795869827270508, |
| "learning_rate": 0.00010929794555089239, |
| "loss": 0.5293, |
| "step": 15320 |
| }, |
| { |
| "epoch": 0.4873089996505607, |
| "grad_norm": 0.9179370403289795, |
| "learning_rate": 0.00010909304074357627, |
| "loss": 0.5089, |
| "step": 15340 |
| }, |
| { |
| "epoch": 0.4879443438482798, |
| "grad_norm": 0.9346722960472107, |
| "learning_rate": 0.0001088880974326618, |
| "loss": 0.4981, |
| "step": 15360 |
| }, |
| { |
| "epoch": 0.48857968804599894, |
| "grad_norm": 0.9835326075553894, |
| "learning_rate": 0.00010868311648596157, |
| "loss": 0.52, |
| "step": 15380 |
| }, |
| { |
| "epoch": 0.48921503224371804, |
| "grad_norm": 0.8709509968757629, |
| "learning_rate": 0.0001084780987714475, |
| "loss": 0.5507, |
| "step": 15400 |
| }, |
| { |
| "epoch": 0.48985037644143714, |
| "grad_norm": 1.0125563144683838, |
| "learning_rate": 0.00010827304515724719, |
| "loss": 0.5522, |
| "step": 15420 |
| }, |
| { |
| "epoch": 0.49048572063915624, |
| "grad_norm": 0.9726683497428894, |
| "learning_rate": 0.00010806795651164026, |
| "loss": 0.5195, |
| "step": 15440 |
| }, |
| { |
| "epoch": 0.4911210648368754, |
| "grad_norm": 0.9348143935203552, |
| "learning_rate": 0.0001078628337030547, |
| "loss": 0.5376, |
| "step": 15460 |
| }, |
| { |
| "epoch": 0.4917564090345945, |
| "grad_norm": 1.247452735900879, |
| "learning_rate": 0.00010765767760006308, |
| "loss": 0.5238, |
| "step": 15480 |
| }, |
| { |
| "epoch": 0.4923917532323136, |
| "grad_norm": 1.2584036588668823, |
| "learning_rate": 0.00010745248907137906, |
| "loss": 0.539, |
| "step": 15500 |
| }, |
| { |
| "epoch": 0.4930270974300327, |
| "grad_norm": 0.9565659165382385, |
| "learning_rate": 0.00010724726898585353, |
| "loss": 0.546, |
| "step": 15520 |
| }, |
| { |
| "epoch": 0.49366244162775186, |
| "grad_norm": 0.9646620750427246, |
| "learning_rate": 0.000107042018212471, |
| "loss": 0.5094, |
| "step": 15540 |
| }, |
| { |
| "epoch": 0.49429778582547096, |
| "grad_norm": 0.7045026421546936, |
| "learning_rate": 0.00010683673762034594, |
| "loss": 0.5708, |
| "step": 15560 |
| }, |
| { |
| "epoch": 0.49493313002319006, |
| "grad_norm": 1.1588184833526611, |
| "learning_rate": 0.00010663142807871911, |
| "loss": 0.5681, |
| "step": 15580 |
| }, |
| { |
| "epoch": 0.49556847422090916, |
| "grad_norm": 0.8272905349731445, |
| "learning_rate": 0.00010642609045695382, |
| "loss": 0.5239, |
| "step": 15600 |
| }, |
| { |
| "epoch": 0.4962038184186283, |
| "grad_norm": 0.9670738577842712, |
| "learning_rate": 0.00010622072562453234, |
| "loss": 0.486, |
| "step": 15620 |
| }, |
| { |
| "epoch": 0.4968391626163474, |
| "grad_norm": 0.8635004162788391, |
| "learning_rate": 0.00010601533445105205, |
| "loss": 0.5419, |
| "step": 15640 |
| }, |
| { |
| "epoch": 0.4974745068140665, |
| "grad_norm": 1.0769212245941162, |
| "learning_rate": 0.00010580991780622196, |
| "loss": 0.5252, |
| "step": 15660 |
| }, |
| { |
| "epoch": 0.4981098510117856, |
| "grad_norm": 0.9688665270805359, |
| "learning_rate": 0.00010560447655985894, |
| "loss": 0.5559, |
| "step": 15680 |
| }, |
| { |
| "epoch": 0.4987451952095048, |
| "grad_norm": 0.9587375521659851, |
| "learning_rate": 0.00010539901158188398, |
| "loss": 0.5136, |
| "step": 15700 |
| }, |
| { |
| "epoch": 0.4993805394072239, |
| "grad_norm": 0.870891273021698, |
| "learning_rate": 0.0001051935237423186, |
| "loss": 0.5274, |
| "step": 15720 |
| }, |
| { |
| "epoch": 0.500015883604943, |
| "grad_norm": 1.1741816997528076, |
| "learning_rate": 0.00010498801391128108, |
| "loss": 0.5274, |
| "step": 15740 |
| }, |
| { |
| "epoch": 0.5006512278026621, |
| "grad_norm": 1.074429988861084, |
| "learning_rate": 0.00010478248295898285, |
| "loss": 0.5049, |
| "step": 15760 |
| }, |
| { |
| "epoch": 0.5012865720003812, |
| "grad_norm": 0.7894431352615356, |
| "learning_rate": 0.00010457693175572483, |
| "loss": 0.5141, |
| "step": 15780 |
| }, |
| { |
| "epoch": 0.5019219161981003, |
| "grad_norm": 0.8638029098510742, |
| "learning_rate": 0.00010437136117189356, |
| "loss": 0.5053, |
| "step": 15800 |
| }, |
| { |
| "epoch": 0.5025572603958194, |
| "grad_norm": 0.9749894142150879, |
| "learning_rate": 0.00010416577207795776, |
| "loss": 0.5319, |
| "step": 15820 |
| }, |
| { |
| "epoch": 0.5031926045935385, |
| "grad_norm": 0.9491709470748901, |
| "learning_rate": 0.00010396016534446451, |
| "loss": 0.4968, |
| "step": 15840 |
| }, |
| { |
| "epoch": 0.5038279487912577, |
| "grad_norm": 0.880732536315918, |
| "learning_rate": 0.00010375454184203555, |
| "loss": 0.5292, |
| "step": 15860 |
| }, |
| { |
| "epoch": 0.5044632929889767, |
| "grad_norm": 1.22807776927948, |
| "learning_rate": 0.00010354890244136361, |
| "loss": 0.5228, |
| "step": 15880 |
| }, |
| { |
| "epoch": 0.5050986371866959, |
| "grad_norm": 0.8567366003990173, |
| "learning_rate": 0.00010334324801320881, |
| "loss": 0.558, |
| "step": 15900 |
| }, |
| { |
| "epoch": 0.505733981384415, |
| "grad_norm": 0.8203198909759521, |
| "learning_rate": 0.00010313757942839482, |
| "loss": 0.5061, |
| "step": 15920 |
| }, |
| { |
| "epoch": 0.5063693255821341, |
| "grad_norm": 0.9894897937774658, |
| "learning_rate": 0.00010293189755780535, |
| "loss": 0.5322, |
| "step": 15940 |
| }, |
| { |
| "epoch": 0.5070046697798533, |
| "grad_norm": 1.0645695924758911, |
| "learning_rate": 0.0001027262032723803, |
| "loss": 0.536, |
| "step": 15960 |
| }, |
| { |
| "epoch": 0.5076400139775723, |
| "grad_norm": 0.9940254092216492, |
| "learning_rate": 0.0001025204974431121, |
| "loss": 0.5211, |
| "step": 15980 |
| }, |
| { |
| "epoch": 0.5082753581752915, |
| "grad_norm": 0.7856065630912781, |
| "learning_rate": 0.00010231478094104216, |
| "loss": 0.5137, |
| "step": 16000 |
| }, |
| { |
| "epoch": 0.5082753581752915, |
| "eval_loss": 0.48191481828689575, |
| "eval_runtime": 44.2211, |
| "eval_samples_per_second": 61.125, |
| "eval_steps_per_second": 30.574, |
| "step": 16000 |
| }, |
| { |
| "epoch": 0.5089107023730106, |
| "grad_norm": 0.9363443851470947, |
| "learning_rate": 0.00010210905463725703, |
| "loss": 0.5426, |
| "step": 16020 |
| }, |
| { |
| "epoch": 0.5095460465707297, |
| "grad_norm": 0.8720065355300903, |
| "learning_rate": 0.0001019033194028848, |
| "loss": 0.525, |
| "step": 16040 |
| }, |
| { |
| "epoch": 0.5101813907684488, |
| "grad_norm": 0.9192999005317688, |
| "learning_rate": 0.00010169757610909131, |
| "loss": 0.5265, |
| "step": 16060 |
| }, |
| { |
| "epoch": 0.510816734966168, |
| "grad_norm": 1.089529037475586, |
| "learning_rate": 0.00010149182562707657, |
| "loss": 0.5148, |
| "step": 16080 |
| }, |
| { |
| "epoch": 0.511452079163887, |
| "grad_norm": 0.8161883354187012, |
| "learning_rate": 0.00010128606882807106, |
| "loss": 0.5441, |
| "step": 16100 |
| }, |
| { |
| "epoch": 0.5120874233616062, |
| "grad_norm": 0.8635348081588745, |
| "learning_rate": 0.00010108030658333192, |
| "loss": 0.4981, |
| "step": 16120 |
| }, |
| { |
| "epoch": 0.5127227675593252, |
| "grad_norm": 0.9366866946220398, |
| "learning_rate": 0.00010087453976413943, |
| "loss": 0.5155, |
| "step": 16140 |
| }, |
| { |
| "epoch": 0.5133581117570444, |
| "grad_norm": 0.8161008954048157, |
| "learning_rate": 0.00010066876924179321, |
| "loss": 0.5178, |
| "step": 16160 |
| }, |
| { |
| "epoch": 0.5139934559547635, |
| "grad_norm": 1.2926280498504639, |
| "learning_rate": 0.00010046299588760855, |
| "loss": 0.5409, |
| "step": 16180 |
| }, |
| { |
| "epoch": 0.5146288001524826, |
| "grad_norm": 0.9963902235031128, |
| "learning_rate": 0.00010025722057291273, |
| "loss": 0.514, |
| "step": 16200 |
| }, |
| { |
| "epoch": 0.5152641443502017, |
| "grad_norm": 0.7572094202041626, |
| "learning_rate": 0.0001000514441690414, |
| "loss": 0.5142, |
| "step": 16220 |
| }, |
| { |
| "epoch": 0.5158994885479208, |
| "grad_norm": 0.7842695713043213, |
| "learning_rate": 9.984566754733471e-05, |
| "loss": 0.5419, |
| "step": 16240 |
| }, |
| { |
| "epoch": 0.5165348327456399, |
| "grad_norm": 0.8259790539741516, |
| "learning_rate": 9.96398915791338e-05, |
| "loss": 0.5053, |
| "step": 16260 |
| }, |
| { |
| "epoch": 0.5171701769433591, |
| "grad_norm": 0.7848758697509766, |
| "learning_rate": 9.943411713577707e-05, |
| "loss": 0.5129, |
| "step": 16280 |
| }, |
| { |
| "epoch": 0.5178055211410781, |
| "grad_norm": 0.9001737236976624, |
| "learning_rate": 9.922834508859636e-05, |
| "loss": 0.5095, |
| "step": 16300 |
| }, |
| { |
| "epoch": 0.5184408653387973, |
| "grad_norm": 1.2547895908355713, |
| "learning_rate": 9.90225763089135e-05, |
| "loss": 0.5402, |
| "step": 16320 |
| }, |
| { |
| "epoch": 0.5190762095365165, |
| "grad_norm": 1.0412747859954834, |
| "learning_rate": 9.881681166803634e-05, |
| "loss": 0.5039, |
| "step": 16340 |
| }, |
| { |
| "epoch": 0.5197115537342355, |
| "grad_norm": 0.8408613204956055, |
| "learning_rate": 9.861105203725533e-05, |
| "loss": 0.5256, |
| "step": 16360 |
| }, |
| { |
| "epoch": 0.5203468979319547, |
| "grad_norm": 0.7325016856193542, |
| "learning_rate": 9.840529828783965e-05, |
| "loss": 0.5055, |
| "step": 16380 |
| }, |
| { |
| "epoch": 0.5209822421296737, |
| "grad_norm": 1.3417218923568726, |
| "learning_rate": 9.819955129103355e-05, |
| "loss": 0.5336, |
| "step": 16400 |
| }, |
| { |
| "epoch": 0.5216175863273929, |
| "grad_norm": 0.8016658425331116, |
| "learning_rate": 9.799381191805272e-05, |
| "loss": 0.5285, |
| "step": 16420 |
| }, |
| { |
| "epoch": 0.522252930525112, |
| "grad_norm": 0.7678484916687012, |
| "learning_rate": 9.778808104008059e-05, |
| "loss": 0.5243, |
| "step": 16440 |
| }, |
| { |
| "epoch": 0.5228882747228311, |
| "grad_norm": 1.0348572731018066, |
| "learning_rate": 9.760293123314227e-05, |
| "loss": 0.5305, |
| "step": 16460 |
| }, |
| { |
| "epoch": 0.5235236189205502, |
| "grad_norm": 0.891635537147522, |
| "learning_rate": 9.739721889566509e-05, |
| "loss": 0.5258, |
| "step": 16480 |
| }, |
| { |
| "epoch": 0.5241589631182694, |
| "grad_norm": 0.9525818824768066, |
| "learning_rate": 9.719151757941184e-05, |
| "loss": 0.5405, |
| "step": 16500 |
| }, |
| { |
| "epoch": 0.5247943073159884, |
| "grad_norm": 0.8067079186439514, |
| "learning_rate": 9.698582815540476e-05, |
| "loss": 0.5058, |
| "step": 16520 |
| }, |
| { |
| "epoch": 0.5254296515137076, |
| "grad_norm": 0.8525674939155579, |
| "learning_rate": 9.678015149461577e-05, |
| "loss": 0.5429, |
| "step": 16540 |
| }, |
| { |
| "epoch": 0.5260649957114266, |
| "grad_norm": 0.9794461727142334, |
| "learning_rate": 9.65744884679627e-05, |
| "loss": 0.5106, |
| "step": 16560 |
| }, |
| { |
| "epoch": 0.5267003399091458, |
| "grad_norm": 0.8107161521911621, |
| "learning_rate": 9.636883994630567e-05, |
| "loss": 0.5124, |
| "step": 16580 |
| }, |
| { |
| "epoch": 0.5273356841068649, |
| "grad_norm": 0.8728024959564209, |
| "learning_rate": 9.61632068004434e-05, |
| "loss": 0.5483, |
| "step": 16600 |
| }, |
| { |
| "epoch": 0.527971028304584, |
| "grad_norm": 1.0132850408554077, |
| "learning_rate": 9.595758990110948e-05, |
| "loss": 0.55, |
| "step": 16620 |
| }, |
| { |
| "epoch": 0.5286063725023031, |
| "grad_norm": 1.0854065418243408, |
| "learning_rate": 9.575199011896869e-05, |
| "loss": 0.5022, |
| "step": 16640 |
| }, |
| { |
| "epoch": 0.5292417167000223, |
| "grad_norm": 1.06479012966156, |
| "learning_rate": 9.555668697368233e-05, |
| "loss": 0.4932, |
| "step": 16660 |
| }, |
| { |
| "epoch": 0.5298770608977413, |
| "grad_norm": 1.1619220972061157, |
| "learning_rate": 9.535112307403999e-05, |
| "loss": 0.5377, |
| "step": 16680 |
| }, |
| { |
| "epoch": 0.5305124050954605, |
| "grad_norm": 1.1277661323547363, |
| "learning_rate": 9.514557885961573e-05, |
| "loss": 0.5267, |
| "step": 16700 |
| }, |
| { |
| "epoch": 0.5311477492931795, |
| "grad_norm": 1.0196537971496582, |
| "learning_rate": 9.494005520076655e-05, |
| "loss": 0.5203, |
| "step": 16720 |
| }, |
| { |
| "epoch": 0.5317830934908987, |
| "grad_norm": 0.9534218907356262, |
| "learning_rate": 9.473455296776239e-05, |
| "loss": 0.5177, |
| "step": 16740 |
| }, |
| { |
| "epoch": 0.5324184376886179, |
| "grad_norm": 0.9330717325210571, |
| "learning_rate": 9.45290730307826e-05, |
| "loss": 0.55, |
| "step": 16760 |
| }, |
| { |
| "epoch": 0.5330537818863369, |
| "grad_norm": 0.9290218949317932, |
| "learning_rate": 9.43236162599119e-05, |
| "loss": 0.5301, |
| "step": 16780 |
| }, |
| { |
| "epoch": 0.533689126084056, |
| "grad_norm": 0.9842971563339233, |
| "learning_rate": 9.411818352513715e-05, |
| "loss": 0.4928, |
| "step": 16800 |
| }, |
| { |
| "epoch": 0.5343244702817751, |
| "grad_norm": 0.9267326593399048, |
| "learning_rate": 9.391277569634329e-05, |
| "loss": 0.5443, |
| "step": 16820 |
| }, |
| { |
| "epoch": 0.5349598144794943, |
| "grad_norm": 0.9270855784416199, |
| "learning_rate": 9.370739364330982e-05, |
| "loss": 0.5132, |
| "step": 16840 |
| }, |
| { |
| "epoch": 0.5355951586772134, |
| "grad_norm": 0.9786942601203918, |
| "learning_rate": 9.35020382357071e-05, |
| "loss": 0.5229, |
| "step": 16860 |
| }, |
| { |
| "epoch": 0.5362305028749325, |
| "grad_norm": 0.8397322297096252, |
| "learning_rate": 9.329671034309269e-05, |
| "loss": 0.5248, |
| "step": 16880 |
| }, |
| { |
| "epoch": 0.5368658470726516, |
| "grad_norm": 0.9696868062019348, |
| "learning_rate": 9.30914108349076e-05, |
| "loss": 0.5635, |
| "step": 16900 |
| }, |
| { |
| "epoch": 0.5375011912703708, |
| "grad_norm": 1.1376127004623413, |
| "learning_rate": 9.28861405804727e-05, |
| "loss": 0.548, |
| "step": 16920 |
| }, |
| { |
| "epoch": 0.5381365354680898, |
| "grad_norm": 0.9028751254081726, |
| "learning_rate": 9.268090044898489e-05, |
| "loss": 0.5253, |
| "step": 16940 |
| }, |
| { |
| "epoch": 0.538771879665809, |
| "grad_norm": 0.7549586296081543, |
| "learning_rate": 9.247569130951365e-05, |
| "loss": 0.5119, |
| "step": 16960 |
| }, |
| { |
| "epoch": 0.539407223863528, |
| "grad_norm": 1.002920150756836, |
| "learning_rate": 9.227051403099715e-05, |
| "loss": 0.5383, |
| "step": 16980 |
| }, |
| { |
| "epoch": 0.5400425680612472, |
| "grad_norm": 0.7857794761657715, |
| "learning_rate": 9.206536948223862e-05, |
| "loss": 0.4943, |
| "step": 17000 |
| }, |
| { |
| "epoch": 0.5400425680612472, |
| "eval_loss": 0.47516322135925293, |
| "eval_runtime": 44.9681, |
| "eval_samples_per_second": 60.109, |
| "eval_steps_per_second": 30.066, |
| "step": 17000 |
| }, |
| { |
| "epoch": 0.5406779122589663, |
| "grad_norm": 0.8384699821472168, |
| "learning_rate": 9.186025853190276e-05, |
| "loss": 0.5005, |
| "step": 17020 |
| }, |
| { |
| "epoch": 0.5413132564566854, |
| "grad_norm": 0.859467089176178, |
| "learning_rate": 9.1655182048512e-05, |
| "loss": 0.486, |
| "step": 17040 |
| }, |
| { |
| "epoch": 0.5419486006544045, |
| "grad_norm": 0.9178836345672607, |
| "learning_rate": 9.145014090044276e-05, |
| "loss": 0.4866, |
| "step": 17060 |
| }, |
| { |
| "epoch": 0.5425839448521237, |
| "grad_norm": 1.5116227865219116, |
| "learning_rate": 9.12451359559219e-05, |
| "loss": 0.5103, |
| "step": 17080 |
| }, |
| { |
| "epoch": 0.5432192890498427, |
| "grad_norm": 0.8251123428344727, |
| "learning_rate": 9.104016808302297e-05, |
| "loss": 0.5403, |
| "step": 17100 |
| }, |
| { |
| "epoch": 0.5438546332475619, |
| "grad_norm": 0.8845348358154297, |
| "learning_rate": 9.08352381496625e-05, |
| "loss": 0.5295, |
| "step": 17120 |
| }, |
| { |
| "epoch": 0.5444899774452809, |
| "grad_norm": 0.8761606812477112, |
| "learning_rate": 9.063034702359643e-05, |
| "loss": 0.5175, |
| "step": 17140 |
| }, |
| { |
| "epoch": 0.5451253216430001, |
| "grad_norm": 0.8992062211036682, |
| "learning_rate": 9.042549557241629e-05, |
| "loss": 0.5211, |
| "step": 17160 |
| }, |
| { |
| "epoch": 0.5457606658407193, |
| "grad_norm": 1.0609464645385742, |
| "learning_rate": 9.022068466354573e-05, |
| "loss": 0.5231, |
| "step": 17180 |
| }, |
| { |
| "epoch": 0.5463960100384383, |
| "grad_norm": 1.1660939455032349, |
| "learning_rate": 9.001591516423664e-05, |
| "loss": 0.5097, |
| "step": 17200 |
| }, |
| { |
| "epoch": 0.5470313542361575, |
| "grad_norm": 0.8982824683189392, |
| "learning_rate": 8.981118794156556e-05, |
| "loss": 0.499, |
| "step": 17220 |
| }, |
| { |
| "epoch": 0.5476666984338765, |
| "grad_norm": 0.9423658847808838, |
| "learning_rate": 8.960650386243009e-05, |
| "loss": 0.5023, |
| "step": 17240 |
| }, |
| { |
| "epoch": 0.5483020426315957, |
| "grad_norm": 0.781741738319397, |
| "learning_rate": 8.940186379354505e-05, |
| "loss": 0.5098, |
| "step": 17260 |
| }, |
| { |
| "epoch": 0.5489373868293148, |
| "grad_norm": 0.9678505063056946, |
| "learning_rate": 8.919726860143895e-05, |
| "loss": 0.5005, |
| "step": 17280 |
| }, |
| { |
| "epoch": 0.5495727310270339, |
| "grad_norm": 0.9400302171707153, |
| "learning_rate": 8.899271915245028e-05, |
| "loss": 0.537, |
| "step": 17300 |
| }, |
| { |
| "epoch": 0.550208075224753, |
| "grad_norm": 0.8072425127029419, |
| "learning_rate": 8.878821631272384e-05, |
| "loss": 0.5073, |
| "step": 17320 |
| }, |
| { |
| "epoch": 0.5508434194224722, |
| "grad_norm": 0.9000498652458191, |
| "learning_rate": 8.858376094820701e-05, |
| "loss": 0.5014, |
| "step": 17340 |
| }, |
| { |
| "epoch": 0.5514787636201912, |
| "grad_norm": 0.9222893118858337, |
| "learning_rate": 8.837935392464621e-05, |
| "loss": 0.5216, |
| "step": 17360 |
| }, |
| { |
| "epoch": 0.5521141078179104, |
| "grad_norm": 0.8468360304832458, |
| "learning_rate": 8.817499610758316e-05, |
| "loss": 0.5282, |
| "step": 17380 |
| }, |
| { |
| "epoch": 0.5527494520156294, |
| "grad_norm": 0.7120311260223389, |
| "learning_rate": 8.797068836235116e-05, |
| "loss": 0.5277, |
| "step": 17400 |
| }, |
| { |
| "epoch": 0.5533847962133486, |
| "grad_norm": 0.880155622959137, |
| "learning_rate": 8.776643155407154e-05, |
| "loss": 0.523, |
| "step": 17420 |
| }, |
| { |
| "epoch": 0.5540201404110677, |
| "grad_norm": 1.023587703704834, |
| "learning_rate": 8.756222654764996e-05, |
| "loss": 0.508, |
| "step": 17440 |
| }, |
| { |
| "epoch": 0.5546554846087868, |
| "grad_norm": 0.8903362154960632, |
| "learning_rate": 8.735807420777262e-05, |
| "loss": 0.5165, |
| "step": 17460 |
| }, |
| { |
| "epoch": 0.5552908288065059, |
| "grad_norm": 0.7317694425582886, |
| "learning_rate": 8.715397539890287e-05, |
| "loss": 0.4672, |
| "step": 17480 |
| }, |
| { |
| "epoch": 0.5559261730042251, |
| "grad_norm": 1.0228464603424072, |
| "learning_rate": 8.694993098527723e-05, |
| "loss": 0.5112, |
| "step": 17500 |
| }, |
| { |
| "epoch": 0.5565615172019441, |
| "grad_norm": 0.7797629237174988, |
| "learning_rate": 8.674594183090199e-05, |
| "loss": 0.477, |
| "step": 17520 |
| }, |
| { |
| "epoch": 0.5571968613996633, |
| "grad_norm": 0.8488342761993408, |
| "learning_rate": 8.654200879954945e-05, |
| "loss": 0.4993, |
| "step": 17540 |
| }, |
| { |
| "epoch": 0.5578322055973823, |
| "grad_norm": 0.8529194593429565, |
| "learning_rate": 8.63381327547542e-05, |
| "loss": 0.5293, |
| "step": 17560 |
| }, |
| { |
| "epoch": 0.5584675497951015, |
| "grad_norm": 0.9537157416343689, |
| "learning_rate": 8.613431455980955e-05, |
| "loss": 0.5047, |
| "step": 17580 |
| }, |
| { |
| "epoch": 0.5591028939928206, |
| "grad_norm": 0.8697558045387268, |
| "learning_rate": 8.593055507776393e-05, |
| "loss": 0.5293, |
| "step": 17600 |
| }, |
| { |
| "epoch": 0.5597382381905397, |
| "grad_norm": 0.8306463360786438, |
| "learning_rate": 8.5726855171417e-05, |
| "loss": 0.5075, |
| "step": 17620 |
| }, |
| { |
| "epoch": 0.5603735823882589, |
| "grad_norm": 0.8880159258842468, |
| "learning_rate": 8.55232157033163e-05, |
| "loss": 0.5149, |
| "step": 17640 |
| }, |
| { |
| "epoch": 0.561008926585978, |
| "grad_norm": 0.9390746355056763, |
| "learning_rate": 8.531963753575334e-05, |
| "loss": 0.5196, |
| "step": 17660 |
| }, |
| { |
| "epoch": 0.561644270783697, |
| "grad_norm": 0.968285322189331, |
| "learning_rate": 8.511612153076015e-05, |
| "loss": 0.5229, |
| "step": 17680 |
| }, |
| { |
| "epoch": 0.5622796149814162, |
| "grad_norm": 0.9114767909049988, |
| "learning_rate": 8.491266855010548e-05, |
| "loss": 0.5008, |
| "step": 17700 |
| }, |
| { |
| "epoch": 0.5629149591791353, |
| "grad_norm": 0.9089644551277161, |
| "learning_rate": 8.470927945529123e-05, |
| "loss": 0.4848, |
| "step": 17720 |
| }, |
| { |
| "epoch": 0.5635503033768544, |
| "grad_norm": 0.7264979481697083, |
| "learning_rate": 8.450595510754877e-05, |
| "loss": 0.5155, |
| "step": 17740 |
| }, |
| { |
| "epoch": 0.5641856475745736, |
| "grad_norm": 0.9070448875427246, |
| "learning_rate": 8.430269636783534e-05, |
| "loss": 0.524, |
| "step": 17760 |
| }, |
| { |
| "epoch": 0.5648209917722926, |
| "grad_norm": 0.9725968241691589, |
| "learning_rate": 8.40995040968303e-05, |
| "loss": 0.4925, |
| "step": 17780 |
| }, |
| { |
| "epoch": 0.5654563359700118, |
| "grad_norm": 0.8976007103919983, |
| "learning_rate": 8.389637915493162e-05, |
| "loss": 0.4937, |
| "step": 17800 |
| }, |
| { |
| "epoch": 0.5660916801677308, |
| "grad_norm": 0.9926420450210571, |
| "learning_rate": 8.369332240225214e-05, |
| "loss": 0.5181, |
| "step": 17820 |
| }, |
| { |
| "epoch": 0.56672702436545, |
| "grad_norm": 0.852676272392273, |
| "learning_rate": 8.349033469861598e-05, |
| "loss": 0.5175, |
| "step": 17840 |
| }, |
| { |
| "epoch": 0.5673623685631691, |
| "grad_norm": 0.8739320635795593, |
| "learning_rate": 8.328741690355487e-05, |
| "loss": 0.4805, |
| "step": 17860 |
| }, |
| { |
| "epoch": 0.5679977127608882, |
| "grad_norm": 0.9660511016845703, |
| "learning_rate": 8.308456987630449e-05, |
| "loss": 0.5063, |
| "step": 17880 |
| }, |
| { |
| "epoch": 0.5686330569586073, |
| "grad_norm": 0.9321526288986206, |
| "learning_rate": 8.288179447580088e-05, |
| "loss": 0.4994, |
| "step": 17900 |
| }, |
| { |
| "epoch": 0.5692684011563265, |
| "grad_norm": 1.0359587669372559, |
| "learning_rate": 8.267909156067685e-05, |
| "loss": 0.5279, |
| "step": 17920 |
| }, |
| { |
| "epoch": 0.5699037453540455, |
| "grad_norm": 0.9722701907157898, |
| "learning_rate": 8.247646198925813e-05, |
| "loss": 0.5061, |
| "step": 17940 |
| }, |
| { |
| "epoch": 0.5705390895517647, |
| "grad_norm": 0.854860782623291, |
| "learning_rate": 8.227390661956006e-05, |
| "loss": 0.4827, |
| "step": 17960 |
| }, |
| { |
| "epoch": 0.5711744337494837, |
| "grad_norm": 0.8997724652290344, |
| "learning_rate": 8.207142630928362e-05, |
| "loss": 0.4978, |
| "step": 17980 |
| }, |
| { |
| "epoch": 0.5718097779472029, |
| "grad_norm": 0.9234896898269653, |
| "learning_rate": 8.186902191581205e-05, |
| "loss": 0.4982, |
| "step": 18000 |
| }, |
| { |
| "epoch": 0.5718097779472029, |
| "eval_loss": 0.469827800989151, |
| "eval_runtime": 44.8258, |
| "eval_samples_per_second": 60.3, |
| "eval_steps_per_second": 30.161, |
| "step": 18000 |
| }, |
| { |
| "epoch": 0.572445122144922, |
| "grad_norm": 0.8457797169685364, |
| "learning_rate": 8.166669429620712e-05, |
| "loss": 0.5263, |
| "step": 18020 |
| }, |
| { |
| "epoch": 0.5730804663426411, |
| "grad_norm": 0.8909218907356262, |
| "learning_rate": 8.146444430720545e-05, |
| "loss": 0.5045, |
| "step": 18040 |
| }, |
| { |
| "epoch": 0.5737158105403602, |
| "grad_norm": 0.950072705745697, |
| "learning_rate": 8.126227280521503e-05, |
| "loss": 0.5247, |
| "step": 18060 |
| }, |
| { |
| "epoch": 0.5743511547380794, |
| "grad_norm": 0.9507225751876831, |
| "learning_rate": 8.106018064631148e-05, |
| "loss": 0.4851, |
| "step": 18080 |
| }, |
| { |
| "epoch": 0.5749864989357985, |
| "grad_norm": 1.0232789516448975, |
| "learning_rate": 8.085816868623436e-05, |
| "loss": 0.5457, |
| "step": 18100 |
| }, |
| { |
| "epoch": 0.5756218431335176, |
| "grad_norm": 1.0967813730239868, |
| "learning_rate": 8.065623778038377e-05, |
| "loss": 0.52, |
| "step": 18120 |
| }, |
| { |
| "epoch": 0.5762571873312367, |
| "grad_norm": 0.7866876125335693, |
| "learning_rate": 8.045438878381649e-05, |
| "loss": 0.5117, |
| "step": 18140 |
| }, |
| { |
| "epoch": 0.5768925315289558, |
| "grad_norm": 0.9325518012046814, |
| "learning_rate": 8.025262255124248e-05, |
| "loss": 0.5415, |
| "step": 18160 |
| }, |
| { |
| "epoch": 0.577527875726675, |
| "grad_norm": 0.8899424076080322, |
| "learning_rate": 8.005093993702133e-05, |
| "loss": 0.4947, |
| "step": 18180 |
| }, |
| { |
| "epoch": 0.578163219924394, |
| "grad_norm": 1.0050842761993408, |
| "learning_rate": 7.984934179515843e-05, |
| "loss": 0.4863, |
| "step": 18200 |
| }, |
| { |
| "epoch": 0.5787985641221132, |
| "grad_norm": 0.836564302444458, |
| "learning_rate": 7.964782897930158e-05, |
| "loss": 0.5055, |
| "step": 18220 |
| }, |
| { |
| "epoch": 0.5794339083198322, |
| "grad_norm": 1.032029628753662, |
| "learning_rate": 7.944640234273724e-05, |
| "loss": 0.4919, |
| "step": 18240 |
| }, |
| { |
| "epoch": 0.5800692525175514, |
| "grad_norm": 0.854015588760376, |
| "learning_rate": 7.92450627383869e-05, |
| "loss": 0.5108, |
| "step": 18260 |
| }, |
| { |
| "epoch": 0.5807045967152705, |
| "grad_norm": 1.0629216432571411, |
| "learning_rate": 7.904381101880364e-05, |
| "loss": 0.5312, |
| "step": 18280 |
| }, |
| { |
| "epoch": 0.5813399409129896, |
| "grad_norm": 0.8146398067474365, |
| "learning_rate": 7.884264803616827e-05, |
| "loss": 0.5203, |
| "step": 18300 |
| }, |
| { |
| "epoch": 0.5819752851107087, |
| "grad_norm": 1.1307437419891357, |
| "learning_rate": 7.864157464228593e-05, |
| "loss": 0.5325, |
| "step": 18320 |
| }, |
| { |
| "epoch": 0.5826106293084279, |
| "grad_norm": 0.9609930515289307, |
| "learning_rate": 7.844059168858241e-05, |
| "loss": 0.5034, |
| "step": 18340 |
| }, |
| { |
| "epoch": 0.5832459735061469, |
| "grad_norm": 0.8615232110023499, |
| "learning_rate": 7.823970002610048e-05, |
| "loss": 0.522, |
| "step": 18360 |
| }, |
| { |
| "epoch": 0.5838813177038661, |
| "grad_norm": 1.014160394668579, |
| "learning_rate": 7.803890050549641e-05, |
| "loss": 0.5104, |
| "step": 18380 |
| }, |
| { |
| "epoch": 0.5845166619015851, |
| "grad_norm": 1.015424370765686, |
| "learning_rate": 7.78381939770363e-05, |
| "loss": 0.4887, |
| "step": 18400 |
| }, |
| { |
| "epoch": 0.5851520060993043, |
| "grad_norm": 1.0072382688522339, |
| "learning_rate": 7.763758129059243e-05, |
| "loss": 0.5242, |
| "step": 18420 |
| }, |
| { |
| "epoch": 0.5857873502970234, |
| "grad_norm": 1.122096300125122, |
| "learning_rate": 7.743706329563971e-05, |
| "loss": 0.5408, |
| "step": 18440 |
| }, |
| { |
| "epoch": 0.5864226944947425, |
| "grad_norm": 0.8347269296646118, |
| "learning_rate": 7.723664084125218e-05, |
| "loss": 0.5112, |
| "step": 18460 |
| }, |
| { |
| "epoch": 0.5870580386924616, |
| "grad_norm": 0.9214980006217957, |
| "learning_rate": 7.703631477609926e-05, |
| "loss": 0.5111, |
| "step": 18480 |
| }, |
| { |
| "epoch": 0.5876933828901808, |
| "grad_norm": 0.8427157402038574, |
| "learning_rate": 7.683608594844218e-05, |
| "loss": 0.5199, |
| "step": 18500 |
| }, |
| { |
| "epoch": 0.5883287270878998, |
| "grad_norm": 0.8485844731330872, |
| "learning_rate": 7.663595520613054e-05, |
| "loss": 0.5193, |
| "step": 18520 |
| }, |
| { |
| "epoch": 0.588964071285619, |
| "grad_norm": 0.8761444687843323, |
| "learning_rate": 7.643592339659848e-05, |
| "loss": 0.5044, |
| "step": 18540 |
| }, |
| { |
| "epoch": 0.589599415483338, |
| "grad_norm": 0.9373889565467834, |
| "learning_rate": 7.623599136686133e-05, |
| "loss": 0.493, |
| "step": 18560 |
| }, |
| { |
| "epoch": 0.5902347596810572, |
| "grad_norm": 0.9052358269691467, |
| "learning_rate": 7.603615996351184e-05, |
| "loss": 0.516, |
| "step": 18580 |
| }, |
| { |
| "epoch": 0.5908701038787764, |
| "grad_norm": 0.7757846117019653, |
| "learning_rate": 7.583643003271668e-05, |
| "loss": 0.5043, |
| "step": 18600 |
| }, |
| { |
| "epoch": 0.5915054480764954, |
| "grad_norm": 0.7769386172294617, |
| "learning_rate": 7.563680242021285e-05, |
| "loss": 0.5005, |
| "step": 18620 |
| }, |
| { |
| "epoch": 0.5921407922742146, |
| "grad_norm": 0.7892422080039978, |
| "learning_rate": 7.543727797130413e-05, |
| "loss": 0.4982, |
| "step": 18640 |
| }, |
| { |
| "epoch": 0.5927761364719337, |
| "grad_norm": 1.0471646785736084, |
| "learning_rate": 7.524782606964114e-05, |
| "loss": 0.5139, |
| "step": 18660 |
| }, |
| { |
| "epoch": 0.5934114806696528, |
| "grad_norm": 0.7995429039001465, |
| "learning_rate": 7.504850521939017e-05, |
| "loss": 0.4736, |
| "step": 18680 |
| }, |
| { |
| "epoch": 0.5940468248673719, |
| "grad_norm": 0.9799679517745972, |
| "learning_rate": 7.484929002382169e-05, |
| "loss": 0.5033, |
| "step": 18700 |
| }, |
| { |
| "epoch": 0.594682169065091, |
| "grad_norm": 0.8607106804847717, |
| "learning_rate": 7.465018132649311e-05, |
| "loss": 0.498, |
| "step": 18720 |
| }, |
| { |
| "epoch": 0.5953175132628101, |
| "grad_norm": 0.9690695405006409, |
| "learning_rate": 7.445117997051085e-05, |
| "loss": 0.4898, |
| "step": 18740 |
| }, |
| { |
| "epoch": 0.5959528574605293, |
| "grad_norm": 1.331871747970581, |
| "learning_rate": 7.425228679852684e-05, |
| "loss": 0.5044, |
| "step": 18760 |
| }, |
| { |
| "epoch": 0.5965882016582483, |
| "grad_norm": 0.9347879886627197, |
| "learning_rate": 7.405350265273492e-05, |
| "loss": 0.5088, |
| "step": 18780 |
| }, |
| { |
| "epoch": 0.5972235458559675, |
| "grad_norm": 0.8495462536811829, |
| "learning_rate": 7.385482837486725e-05, |
| "loss": 0.5078, |
| "step": 18800 |
| }, |
| { |
| "epoch": 0.5978588900536865, |
| "grad_norm": 1.318202257156372, |
| "learning_rate": 7.365626480619081e-05, |
| "loss": 0.5014, |
| "step": 18820 |
| }, |
| { |
| "epoch": 0.5984942342514057, |
| "grad_norm": 1.0349724292755127, |
| "learning_rate": 7.345781278750368e-05, |
| "loss": 0.531, |
| "step": 18840 |
| }, |
| { |
| "epoch": 0.5991295784491248, |
| "grad_norm": 1.047760248184204, |
| "learning_rate": 7.326938745831322e-05, |
| "loss": 0.4925, |
| "step": 18860 |
| }, |
| { |
| "epoch": 0.5997649226468439, |
| "grad_norm": 0.874220073223114, |
| "learning_rate": 7.307115537865903e-05, |
| "loss": 0.5056, |
| "step": 18880 |
| }, |
| { |
| "epoch": 0.600400266844563, |
| "grad_norm": 0.738158106803894, |
| "learning_rate": 7.287303732658328e-05, |
| "loss": 0.4938, |
| "step": 18900 |
| }, |
| { |
| "epoch": 0.6010356110422822, |
| "grad_norm": 0.8721213936805725, |
| "learning_rate": 7.267503414099758e-05, |
| "loss": 0.5074, |
| "step": 18920 |
| }, |
| { |
| "epoch": 0.6016709552400012, |
| "grad_norm": 0.7241856455802917, |
| "learning_rate": 7.247714666032724e-05, |
| "loss": 0.5045, |
| "step": 18940 |
| }, |
| { |
| "epoch": 0.6023062994377204, |
| "grad_norm": 1.0385938882827759, |
| "learning_rate": 7.227937572250761e-05, |
| "loss": 0.5313, |
| "step": 18960 |
| }, |
| { |
| "epoch": 0.6029416436354395, |
| "grad_norm": 1.8555858135223389, |
| "learning_rate": 7.208172216498046e-05, |
| "loss": 0.4989, |
| "step": 18980 |
| }, |
| { |
| "epoch": 0.6035769878331586, |
| "grad_norm": 0.9453182816505432, |
| "learning_rate": 7.188418682469064e-05, |
| "loss": 0.5146, |
| "step": 19000 |
| }, |
| { |
| "epoch": 0.6035769878331586, |
| "eval_loss": 0.46334323287010193, |
| "eval_runtime": 44.8428, |
| "eval_samples_per_second": 60.277, |
| "eval_steps_per_second": 30.15, |
| "step": 19000 |
| }, |
| { |
| "epoch": 0.6042123320308778, |
| "grad_norm": 0.9362254738807678, |
| "learning_rate": 7.168677053808237e-05, |
| "loss": 0.5148, |
| "step": 19020 |
| }, |
| { |
| "epoch": 0.6048476762285968, |
| "grad_norm": 1.19162917137146, |
| "learning_rate": 7.148947414109572e-05, |
| "loss": 0.4954, |
| "step": 19040 |
| }, |
| { |
| "epoch": 0.605483020426316, |
| "grad_norm": 0.9854863286018372, |
| "learning_rate": 7.129229846916318e-05, |
| "loss": 0.5173, |
| "step": 19060 |
| }, |
| { |
| "epoch": 0.6061183646240351, |
| "grad_norm": 0.8435449600219727, |
| "learning_rate": 7.109524435720597e-05, |
| "loss": 0.5154, |
| "step": 19080 |
| }, |
| { |
| "epoch": 0.6067537088217542, |
| "grad_norm": 0.920364260673523, |
| "learning_rate": 7.08983126396306e-05, |
| "loss": 0.5092, |
| "step": 19100 |
| }, |
| { |
| "epoch": 0.6073890530194733, |
| "grad_norm": 1.2439565658569336, |
| "learning_rate": 7.070150415032527e-05, |
| "loss": 0.511, |
| "step": 19120 |
| }, |
| { |
| "epoch": 0.6080243972171924, |
| "grad_norm": 0.7429732084274292, |
| "learning_rate": 7.050481972265648e-05, |
| "loss": 0.4787, |
| "step": 19140 |
| }, |
| { |
| "epoch": 0.6086597414149115, |
| "grad_norm": 0.6966003179550171, |
| "learning_rate": 7.03082601894653e-05, |
| "loss": 0.5237, |
| "step": 19160 |
| }, |
| { |
| "epoch": 0.6092950856126307, |
| "grad_norm": 0.8211964964866638, |
| "learning_rate": 7.011182638306402e-05, |
| "loss": 0.5349, |
| "step": 19180 |
| }, |
| { |
| "epoch": 0.6099304298103497, |
| "grad_norm": 0.9803711771965027, |
| "learning_rate": 6.991551913523253e-05, |
| "loss": 0.5369, |
| "step": 19200 |
| }, |
| { |
| "epoch": 0.6105657740080689, |
| "grad_norm": 0.9161061644554138, |
| "learning_rate": 6.971933927721479e-05, |
| "loss": 0.4993, |
| "step": 19220 |
| }, |
| { |
| "epoch": 0.611201118205788, |
| "grad_norm": 0.9608227014541626, |
| "learning_rate": 6.952328763971537e-05, |
| "loss": 0.4837, |
| "step": 19240 |
| }, |
| { |
| "epoch": 0.6118364624035071, |
| "grad_norm": 0.9438381195068359, |
| "learning_rate": 6.932736505289592e-05, |
| "loss": 0.479, |
| "step": 19260 |
| }, |
| { |
| "epoch": 0.6124718066012262, |
| "grad_norm": 1.571315884590149, |
| "learning_rate": 6.91315723463716e-05, |
| "loss": 0.5417, |
| "step": 19280 |
| }, |
| { |
| "epoch": 0.6131071507989453, |
| "grad_norm": 0.8187804818153381, |
| "learning_rate": 6.893591034920763e-05, |
| "loss": 0.5189, |
| "step": 19300 |
| }, |
| { |
| "epoch": 0.6137424949966644, |
| "grad_norm": 0.7617794871330261, |
| "learning_rate": 6.87403798899157e-05, |
| "loss": 0.468, |
| "step": 19320 |
| }, |
| { |
| "epoch": 0.6143778391943836, |
| "grad_norm": 0.8723959922790527, |
| "learning_rate": 6.85449817964506e-05, |
| "loss": 0.5044, |
| "step": 19340 |
| }, |
| { |
| "epoch": 0.6150131833921026, |
| "grad_norm": 0.7760429382324219, |
| "learning_rate": 6.834971689620659e-05, |
| "loss": 0.4922, |
| "step": 19360 |
| }, |
| { |
| "epoch": 0.6156485275898218, |
| "grad_norm": 0.925581693649292, |
| "learning_rate": 6.815458601601392e-05, |
| "loss": 0.5079, |
| "step": 19380 |
| }, |
| { |
| "epoch": 0.6162838717875408, |
| "grad_norm": 0.8069369792938232, |
| "learning_rate": 6.795958998213535e-05, |
| "loss": 0.4995, |
| "step": 19400 |
| }, |
| { |
| "epoch": 0.61691921598526, |
| "grad_norm": 1.3501884937286377, |
| "learning_rate": 6.77647296202627e-05, |
| "loss": 0.4906, |
| "step": 19420 |
| }, |
| { |
| "epoch": 0.6175545601829792, |
| "grad_norm": 0.9078099131584167, |
| "learning_rate": 6.75700057555132e-05, |
| "loss": 0.4983, |
| "step": 19440 |
| }, |
| { |
| "epoch": 0.6181899043806982, |
| "grad_norm": 0.7792625427246094, |
| "learning_rate": 6.737541921242619e-05, |
| "loss": 0.4869, |
| "step": 19460 |
| }, |
| { |
| "epoch": 0.6188252485784174, |
| "grad_norm": 0.8952593803405762, |
| "learning_rate": 6.718097081495947e-05, |
| "loss": 0.4975, |
| "step": 19480 |
| }, |
| { |
| "epoch": 0.6194605927761365, |
| "grad_norm": 0.9192362427711487, |
| "learning_rate": 6.698666138648593e-05, |
| "loss": 0.5059, |
| "step": 19500 |
| }, |
| { |
| "epoch": 0.6200959369738556, |
| "grad_norm": 0.8911659121513367, |
| "learning_rate": 6.679249174978997e-05, |
| "loss": 0.5014, |
| "step": 19520 |
| }, |
| { |
| "epoch": 0.6207312811715747, |
| "grad_norm": 0.9853730201721191, |
| "learning_rate": 6.659846272706406e-05, |
| "loss": 0.4935, |
| "step": 19540 |
| }, |
| { |
| "epoch": 0.6213666253692938, |
| "grad_norm": 1.3485686779022217, |
| "learning_rate": 6.640457513990527e-05, |
| "loss": 0.5061, |
| "step": 19560 |
| }, |
| { |
| "epoch": 0.6220019695670129, |
| "grad_norm": 0.8757696747779846, |
| "learning_rate": 6.621082980931179e-05, |
| "loss": 0.4869, |
| "step": 19580 |
| }, |
| { |
| "epoch": 0.6226373137647321, |
| "grad_norm": 1.0088223218917847, |
| "learning_rate": 6.601722755567937e-05, |
| "loss": 0.5138, |
| "step": 19600 |
| }, |
| { |
| "epoch": 0.6232726579624511, |
| "grad_norm": 0.94034343957901, |
| "learning_rate": 6.582376919879798e-05, |
| "loss": 0.5159, |
| "step": 19620 |
| }, |
| { |
| "epoch": 0.6239080021601703, |
| "grad_norm": 0.834994375705719, |
| "learning_rate": 6.563045555784826e-05, |
| "loss": 0.4862, |
| "step": 19640 |
| }, |
| { |
| "epoch": 0.6245433463578894, |
| "grad_norm": 1.2617956399917603, |
| "learning_rate": 6.543728745139802e-05, |
| "loss": 0.5112, |
| "step": 19660 |
| }, |
| { |
| "epoch": 0.6251786905556085, |
| "grad_norm": 0.8542491793632507, |
| "learning_rate": 6.524426569739892e-05, |
| "loss": 0.5234, |
| "step": 19680 |
| }, |
| { |
| "epoch": 0.6258140347533276, |
| "grad_norm": 1.162606120109558, |
| "learning_rate": 6.505139111318277e-05, |
| "loss": 0.4772, |
| "step": 19700 |
| }, |
| { |
| "epoch": 0.6264493789510467, |
| "grad_norm": 1.0025289058685303, |
| "learning_rate": 6.48586645154583e-05, |
| "loss": 0.5212, |
| "step": 19720 |
| }, |
| { |
| "epoch": 0.6270847231487658, |
| "grad_norm": 1.0566537380218506, |
| "learning_rate": 6.466608672030763e-05, |
| "loss": 0.5556, |
| "step": 19740 |
| }, |
| { |
| "epoch": 0.627720067346485, |
| "grad_norm": 1.0380536317825317, |
| "learning_rate": 6.447365854318266e-05, |
| "loss": 0.4827, |
| "step": 19760 |
| }, |
| { |
| "epoch": 0.628355411544204, |
| "grad_norm": 1.0499038696289062, |
| "learning_rate": 6.42813807989019e-05, |
| "loss": 0.5316, |
| "step": 19780 |
| }, |
| { |
| "epoch": 0.6289907557419232, |
| "grad_norm": 0.7457720637321472, |
| "learning_rate": 6.408925430164669e-05, |
| "loss": 0.5055, |
| "step": 19800 |
| }, |
| { |
| "epoch": 0.6296260999396422, |
| "grad_norm": 1.2990676164627075, |
| "learning_rate": 6.389727986495813e-05, |
| "loss": 0.5068, |
| "step": 19820 |
| }, |
| { |
| "epoch": 0.6302614441373614, |
| "grad_norm": 0.9500844478607178, |
| "learning_rate": 6.370545830173332e-05, |
| "loss": 0.4889, |
| "step": 19840 |
| }, |
| { |
| "epoch": 0.6308967883350806, |
| "grad_norm": 0.7668824195861816, |
| "learning_rate": 6.351379042422199e-05, |
| "loss": 0.5314, |
| "step": 19860 |
| }, |
| { |
| "epoch": 0.6315321325327996, |
| "grad_norm": 0.9457335472106934, |
| "learning_rate": 6.332227704402321e-05, |
| "loss": 0.4898, |
| "step": 19880 |
| }, |
| { |
| "epoch": 0.6321674767305188, |
| "grad_norm": 0.8252271413803101, |
| "learning_rate": 6.31309189720818e-05, |
| "loss": 0.5045, |
| "step": 19900 |
| }, |
| { |
| "epoch": 0.6328028209282379, |
| "grad_norm": 0.9943385720252991, |
| "learning_rate": 6.29397170186849e-05, |
| "loss": 0.5243, |
| "step": 19920 |
| }, |
| { |
| "epoch": 0.633438165125957, |
| "grad_norm": 1.1582151651382446, |
| "learning_rate": 6.27582205051849e-05, |
| "loss": 0.5331, |
| "step": 19940 |
| }, |
| { |
| "epoch": 0.6340735093236761, |
| "grad_norm": 0.9436770677566528, |
| "learning_rate": 6.256732531103176e-05, |
| "loss": 0.4903, |
| "step": 19960 |
| }, |
| { |
| "epoch": 0.6347088535213952, |
| "grad_norm": 0.8253883123397827, |
| "learning_rate": 6.237658862190583e-05, |
| "loss": 0.4934, |
| "step": 19980 |
| }, |
| { |
| "epoch": 0.6353441977191143, |
| "grad_norm": 0.8770557641983032, |
| "learning_rate": 6.21860112454631e-05, |
| "loss": 0.5202, |
| "step": 20000 |
| }, |
| { |
| "epoch": 0.6353441977191143, |
| "eval_loss": 0.45828375220298767, |
| "eval_runtime": 44.5614, |
| "eval_samples_per_second": 60.658, |
| "eval_steps_per_second": 30.34, |
| "step": 20000 |
| }, |
| { |
| "epoch": 0.6359795419168335, |
| "grad_norm": 1.2218546867370605, |
| "learning_rate": 6.19955939886849e-05, |
| "loss": 0.5171, |
| "step": 20020 |
| }, |
| { |
| "epoch": 0.6366148861145525, |
| "grad_norm": 0.8330618143081665, |
| "learning_rate": 6.180533765787468e-05, |
| "loss": 0.4863, |
| "step": 20040 |
| }, |
| { |
| "epoch": 0.6372502303122717, |
| "grad_norm": 1.0419652462005615, |
| "learning_rate": 6.162474393506114e-05, |
| "loss": 0.5427, |
| "step": 20060 |
| }, |
| { |
| "epoch": 0.6378855745099908, |
| "grad_norm": 0.9472757577896118, |
| "learning_rate": 6.143480372643493e-05, |
| "loss": 0.5245, |
| "step": 20080 |
| }, |
| { |
| "epoch": 0.6385209187077099, |
| "grad_norm": 0.7603405117988586, |
| "learning_rate": 6.12450268183886e-05, |
| "loss": 0.4964, |
| "step": 20100 |
| }, |
| { |
| "epoch": 0.639156262905429, |
| "grad_norm": 0.8776742219924927, |
| "learning_rate": 6.105541401451404e-05, |
| "loss": 0.4966, |
| "step": 20120 |
| }, |
| { |
| "epoch": 0.6397916071031481, |
| "grad_norm": 0.8271143436431885, |
| "learning_rate": 6.086596611770831e-05, |
| "loss": 0.5119, |
| "step": 20140 |
| }, |
| { |
| "epoch": 0.6404269513008672, |
| "grad_norm": 1.1509547233581543, |
| "learning_rate": 6.067668393017007e-05, |
| "loss": 0.5031, |
| "step": 20160 |
| }, |
| { |
| "epoch": 0.6410622954985864, |
| "grad_norm": 0.8693366050720215, |
| "learning_rate": 6.048756825339643e-05, |
| "loss": 0.4986, |
| "step": 20180 |
| }, |
| { |
| "epoch": 0.6416976396963054, |
| "grad_norm": 0.949834942817688, |
| "learning_rate": 6.029861988817935e-05, |
| "loss": 0.4921, |
| "step": 20200 |
| }, |
| { |
| "epoch": 0.6423329838940246, |
| "grad_norm": 0.9004225730895996, |
| "learning_rate": 6.010983963460233e-05, |
| "loss": 0.5023, |
| "step": 20220 |
| }, |
| { |
| "epoch": 0.6429683280917438, |
| "grad_norm": 0.7829142808914185, |
| "learning_rate": 5.9921228292037026e-05, |
| "loss": 0.507, |
| "step": 20240 |
| }, |
| { |
| "epoch": 0.6436036722894628, |
| "grad_norm": 1.1816707849502563, |
| "learning_rate": 5.973278665913985e-05, |
| "loss": 0.4926, |
| "step": 20260 |
| }, |
| { |
| "epoch": 0.644239016487182, |
| "grad_norm": 0.881648063659668, |
| "learning_rate": 5.9544515533848614e-05, |
| "loss": 0.4885, |
| "step": 20280 |
| }, |
| { |
| "epoch": 0.644874360684901, |
| "grad_norm": 0.9568135738372803, |
| "learning_rate": 5.9356415713379145e-05, |
| "loss": 0.515, |
| "step": 20300 |
| }, |
| { |
| "epoch": 0.6455097048826202, |
| "grad_norm": 0.9377472400665283, |
| "learning_rate": 5.9168487994221834e-05, |
| "loss": 0.4886, |
| "step": 20320 |
| }, |
| { |
| "epoch": 0.6461450490803393, |
| "grad_norm": 0.9032811522483826, |
| "learning_rate": 5.898073317213837e-05, |
| "loss": 0.5064, |
| "step": 20340 |
| }, |
| { |
| "epoch": 0.6467803932780584, |
| "grad_norm": 0.9788734316825867, |
| "learning_rate": 5.879315204215836e-05, |
| "loss": 0.4698, |
| "step": 20360 |
| }, |
| { |
| "epoch": 0.6474157374757775, |
| "grad_norm": 1.0353432893753052, |
| "learning_rate": 5.860574539857584e-05, |
| "loss": 0.5227, |
| "step": 20380 |
| }, |
| { |
| "epoch": 0.6480510816734966, |
| "grad_norm": 0.8998845815658569, |
| "learning_rate": 5.84185140349461e-05, |
| "loss": 0.5132, |
| "step": 20400 |
| }, |
| { |
| "epoch": 0.6486864258712157, |
| "grad_norm": 0.8317026495933533, |
| "learning_rate": 5.82314587440821e-05, |
| "loss": 0.468, |
| "step": 20420 |
| }, |
| { |
| "epoch": 0.6493217700689349, |
| "grad_norm": 0.7740748524665833, |
| "learning_rate": 5.80445803180514e-05, |
| "loss": 0.5119, |
| "step": 20440 |
| }, |
| { |
| "epoch": 0.6499571142666539, |
| "grad_norm": 1.0922515392303467, |
| "learning_rate": 5.78578795481725e-05, |
| "loss": 0.5284, |
| "step": 20460 |
| }, |
| { |
| "epoch": 0.6505924584643731, |
| "grad_norm": 0.8265649676322937, |
| "learning_rate": 5.76713572250117e-05, |
| "loss": 0.5095, |
| "step": 20480 |
| }, |
| { |
| "epoch": 0.6512278026620922, |
| "grad_norm": 1.0644861459732056, |
| "learning_rate": 5.748501413837963e-05, |
| "loss": 0.5028, |
| "step": 20500 |
| }, |
| { |
| "epoch": 0.6518631468598113, |
| "grad_norm": 0.9139828681945801, |
| "learning_rate": 5.729885107732808e-05, |
| "loss": 0.4814, |
| "step": 20520 |
| }, |
| { |
| "epoch": 0.6524984910575304, |
| "grad_norm": 0.7917624115943909, |
| "learning_rate": 5.7112868830146416e-05, |
| "loss": 0.4772, |
| "step": 20540 |
| }, |
| { |
| "epoch": 0.6531338352552495, |
| "grad_norm": 0.7677121162414551, |
| "learning_rate": 5.692706818435836e-05, |
| "loss": 0.519, |
| "step": 20560 |
| }, |
| { |
| "epoch": 0.6537691794529686, |
| "grad_norm": 0.8412395715713501, |
| "learning_rate": 5.674144992671882e-05, |
| "loss": 0.501, |
| "step": 20580 |
| }, |
| { |
| "epoch": 0.6544045236506878, |
| "grad_norm": 1.014061689376831, |
| "learning_rate": 5.655601484321022e-05, |
| "loss": 0.5122, |
| "step": 20600 |
| }, |
| { |
| "epoch": 0.6550398678484068, |
| "grad_norm": 1.0746990442276, |
| "learning_rate": 5.6370763719039375e-05, |
| "loss": 0.4969, |
| "step": 20620 |
| }, |
| { |
| "epoch": 0.655675212046126, |
| "grad_norm": 0.9021841883659363, |
| "learning_rate": 5.6185697338634304e-05, |
| "loss": 0.4771, |
| "step": 20640 |
| }, |
| { |
| "epoch": 0.6563105562438452, |
| "grad_norm": 0.8193987607955933, |
| "learning_rate": 5.600081648564056e-05, |
| "loss": 0.5143, |
| "step": 20660 |
| }, |
| { |
| "epoch": 0.6569459004415642, |
| "grad_norm": 1.152421474456787, |
| "learning_rate": 5.581612194291814e-05, |
| "loss": 0.4873, |
| "step": 20680 |
| }, |
| { |
| "epoch": 0.6575812446392834, |
| "grad_norm": 0.8709347248077393, |
| "learning_rate": 5.5631614492538217e-05, |
| "loss": 0.5199, |
| "step": 20700 |
| }, |
| { |
| "epoch": 0.6582165888370024, |
| "grad_norm": 0.827723503112793, |
| "learning_rate": 5.544729491577967e-05, |
| "loss": 0.4917, |
| "step": 20720 |
| }, |
| { |
| "epoch": 0.6588519330347216, |
| "grad_norm": 1.5408345460891724, |
| "learning_rate": 5.526316399312579e-05, |
| "loss": 0.5562, |
| "step": 20740 |
| }, |
| { |
| "epoch": 0.6594872772324407, |
| "grad_norm": 0.731490433216095, |
| "learning_rate": 5.507922250426118e-05, |
| "loss": 0.4927, |
| "step": 20760 |
| }, |
| { |
| "epoch": 0.6601226214301598, |
| "grad_norm": 0.950702428817749, |
| "learning_rate": 5.4895471228068185e-05, |
| "loss": 0.5115, |
| "step": 20780 |
| }, |
| { |
| "epoch": 0.6607579656278789, |
| "grad_norm": 0.8342424631118774, |
| "learning_rate": 5.471191094262369e-05, |
| "loss": 0.4856, |
| "step": 20800 |
| }, |
| { |
| "epoch": 0.661393309825598, |
| "grad_norm": 0.9297844767570496, |
| "learning_rate": 5.4528542425196004e-05, |
| "loss": 0.4896, |
| "step": 20820 |
| }, |
| { |
| "epoch": 0.6620286540233171, |
| "grad_norm": 0.7558259963989258, |
| "learning_rate": 5.434536645224126e-05, |
| "loss": 0.4895, |
| "step": 20840 |
| }, |
| { |
| "epoch": 0.6626639982210363, |
| "grad_norm": 1.2116395235061646, |
| "learning_rate": 5.416238379940035e-05, |
| "loss": 0.507, |
| "step": 20860 |
| }, |
| { |
| "epoch": 0.6632993424187553, |
| "grad_norm": 0.913467526435852, |
| "learning_rate": 5.39795952414955e-05, |
| "loss": 0.5137, |
| "step": 20880 |
| }, |
| { |
| "epoch": 0.6639346866164745, |
| "grad_norm": 0.868238627910614, |
| "learning_rate": 5.3797001552527184e-05, |
| "loss": 0.5185, |
| "step": 20900 |
| }, |
| { |
| "epoch": 0.6645700308141936, |
| "grad_norm": 1.0668286085128784, |
| "learning_rate": 5.361460350567062e-05, |
| "loss": 0.5158, |
| "step": 20920 |
| }, |
| { |
| "epoch": 0.6652053750119127, |
| "grad_norm": 0.795097291469574, |
| "learning_rate": 5.3432401873272655e-05, |
| "loss": 0.4985, |
| "step": 20940 |
| }, |
| { |
| "epoch": 0.6658407192096318, |
| "grad_norm": 0.6949301958084106, |
| "learning_rate": 5.325039742684839e-05, |
| "loss": 0.4722, |
| "step": 20960 |
| }, |
| { |
| "epoch": 0.6664760634073509, |
| "grad_norm": 0.7859952449798584, |
| "learning_rate": 5.3068590937077945e-05, |
| "loss": 0.4933, |
| "step": 20980 |
| }, |
| { |
| "epoch": 0.66711140760507, |
| "grad_norm": 0.8529000282287598, |
| "learning_rate": 5.288698317380334e-05, |
| "loss": 0.5098, |
| "step": 21000 |
| }, |
| { |
| "epoch": 0.66711140760507, |
| "eval_loss": 0.45643100142478943, |
| "eval_runtime": 44.6378, |
| "eval_samples_per_second": 60.554, |
| "eval_steps_per_second": 30.288, |
| "step": 21000 |
| }, |
| { |
| "epoch": 0.6677467518027892, |
| "grad_norm": 0.9853639602661133, |
| "learning_rate": 5.270557490602499e-05, |
| "loss": 0.4715, |
| "step": 21020 |
| }, |
| { |
| "epoch": 0.6683820960005082, |
| "grad_norm": 0.8387131690979004, |
| "learning_rate": 5.2524366901898566e-05, |
| "loss": 0.5128, |
| "step": 21040 |
| }, |
| { |
| "epoch": 0.6690174401982274, |
| "grad_norm": 0.8610044717788696, |
| "learning_rate": 5.234335992873176e-05, |
| "loss": 0.5424, |
| "step": 21060 |
| }, |
| { |
| "epoch": 0.6696527843959466, |
| "grad_norm": 0.8878015279769897, |
| "learning_rate": 5.216255475298109e-05, |
| "loss": 0.4734, |
| "step": 21080 |
| }, |
| { |
| "epoch": 0.6702881285936656, |
| "grad_norm": 1.0038951635360718, |
| "learning_rate": 5.198195214024848e-05, |
| "loss": 0.4879, |
| "step": 21100 |
| }, |
| { |
| "epoch": 0.6709234727913848, |
| "grad_norm": 0.9256641864776611, |
| "learning_rate": 5.1801552855278126e-05, |
| "loss": 0.527, |
| "step": 21120 |
| }, |
| { |
| "epoch": 0.6715588169891038, |
| "grad_norm": 0.7668296098709106, |
| "learning_rate": 5.162135766195337e-05, |
| "loss": 0.5161, |
| "step": 21140 |
| }, |
| { |
| "epoch": 0.672194161186823, |
| "grad_norm": 0.7756738066673279, |
| "learning_rate": 5.144136732329323e-05, |
| "loss": 0.5265, |
| "step": 21160 |
| }, |
| { |
| "epoch": 0.6728295053845421, |
| "grad_norm": 0.9279829859733582, |
| "learning_rate": 5.1261582601449285e-05, |
| "loss": 0.4814, |
| "step": 21180 |
| }, |
| { |
| "epoch": 0.6734648495822612, |
| "grad_norm": 1.1274375915527344, |
| "learning_rate": 5.108200425770255e-05, |
| "loss": 0.5061, |
| "step": 21200 |
| }, |
| { |
| "epoch": 0.6741001937799803, |
| "grad_norm": 1.082535982131958, |
| "learning_rate": 5.090263305246006e-05, |
| "loss": 0.5081, |
| "step": 21220 |
| }, |
| { |
| "epoch": 0.6747355379776995, |
| "grad_norm": 1.0355536937713623, |
| "learning_rate": 5.0723469745251725e-05, |
| "loss": 0.5044, |
| "step": 21240 |
| }, |
| { |
| "epoch": 0.6753708821754185, |
| "grad_norm": 0.9309506416320801, |
| "learning_rate": 5.054451509472728e-05, |
| "loss": 0.5241, |
| "step": 21260 |
| }, |
| { |
| "epoch": 0.6760062263731377, |
| "grad_norm": 0.818247377872467, |
| "learning_rate": 5.0365769858652735e-05, |
| "loss": 0.5034, |
| "step": 21280 |
| }, |
| { |
| "epoch": 0.6766415705708567, |
| "grad_norm": 0.8921930193901062, |
| "learning_rate": 5.0187234793907447e-05, |
| "loss": 0.5089, |
| "step": 21300 |
| }, |
| { |
| "epoch": 0.6772769147685759, |
| "grad_norm": 0.9915839433670044, |
| "learning_rate": 5.000891065648087e-05, |
| "loss": 0.5049, |
| "step": 21320 |
| }, |
| { |
| "epoch": 0.677912258966295, |
| "grad_norm": 0.8783996105194092, |
| "learning_rate": 4.983079820146922e-05, |
| "loss": 0.5314, |
| "step": 21340 |
| }, |
| { |
| "epoch": 0.6785476031640141, |
| "grad_norm": 0.8735405802726746, |
| "learning_rate": 4.96528981830724e-05, |
| "loss": 0.5036, |
| "step": 21360 |
| }, |
| { |
| "epoch": 0.6791829473617332, |
| "grad_norm": 0.9674988389015198, |
| "learning_rate": 4.947521135459072e-05, |
| "loss": 0.5269, |
| "step": 21380 |
| }, |
| { |
| "epoch": 0.6798182915594523, |
| "grad_norm": 0.9271227717399597, |
| "learning_rate": 4.9297738468421896e-05, |
| "loss": 0.5061, |
| "step": 21400 |
| }, |
| { |
| "epoch": 0.6804536357571714, |
| "grad_norm": 0.7828012704849243, |
| "learning_rate": 4.912048027605759e-05, |
| "loss": 0.4978, |
| "step": 21420 |
| }, |
| { |
| "epoch": 0.6810889799548906, |
| "grad_norm": 1.3417547941207886, |
| "learning_rate": 4.8943437528080385e-05, |
| "loss": 0.5326, |
| "step": 21440 |
| }, |
| { |
| "epoch": 0.6817243241526096, |
| "grad_norm": 0.8963372707366943, |
| "learning_rate": 4.876661097416066e-05, |
| "loss": 0.4989, |
| "step": 21460 |
| }, |
| { |
| "epoch": 0.6823596683503288, |
| "grad_norm": 0.893553614616394, |
| "learning_rate": 4.859000136305329e-05, |
| "loss": 0.4859, |
| "step": 21480 |
| }, |
| { |
| "epoch": 0.682995012548048, |
| "grad_norm": 1.2325243949890137, |
| "learning_rate": 4.8413609442594445e-05, |
| "loss": 0.5037, |
| "step": 21500 |
| }, |
| { |
| "epoch": 0.683630356745767, |
| "grad_norm": 0.8049502372741699, |
| "learning_rate": 4.8237435959698706e-05, |
| "loss": 0.509, |
| "step": 21520 |
| }, |
| { |
| "epoch": 0.6842657009434862, |
| "grad_norm": 1.2289927005767822, |
| "learning_rate": 4.8061481660355534e-05, |
| "loss": 0.5128, |
| "step": 21540 |
| }, |
| { |
| "epoch": 0.6849010451412052, |
| "grad_norm": 0.8123481869697571, |
| "learning_rate": 4.7885747289626284e-05, |
| "loss": 0.5031, |
| "step": 21560 |
| }, |
| { |
| "epoch": 0.6855363893389244, |
| "grad_norm": 0.8852875232696533, |
| "learning_rate": 4.771023359164116e-05, |
| "loss": 0.4875, |
| "step": 21580 |
| }, |
| { |
| "epoch": 0.6861717335366435, |
| "grad_norm": 0.8462742567062378, |
| "learning_rate": 4.753494130959586e-05, |
| "loss": 0.4787, |
| "step": 21600 |
| }, |
| { |
| "epoch": 0.6868070777343626, |
| "grad_norm": 0.99876868724823, |
| "learning_rate": 4.7359871185748485e-05, |
| "loss": 0.5116, |
| "step": 21620 |
| }, |
| { |
| "epoch": 0.6874424219320817, |
| "grad_norm": 0.9393181204795837, |
| "learning_rate": 4.718502396141656e-05, |
| "loss": 0.4878, |
| "step": 21640 |
| }, |
| { |
| "epoch": 0.6880777661298009, |
| "grad_norm": 0.8426542282104492, |
| "learning_rate": 4.701040037697364e-05, |
| "loss": 0.4897, |
| "step": 21660 |
| }, |
| { |
| "epoch": 0.6887131103275199, |
| "grad_norm": 0.938210666179657, |
| "learning_rate": 4.683600117184631e-05, |
| "loss": 0.492, |
| "step": 21680 |
| }, |
| { |
| "epoch": 0.6893484545252391, |
| "grad_norm": 0.8325148820877075, |
| "learning_rate": 4.666182708451114e-05, |
| "loss": 0.4842, |
| "step": 21700 |
| }, |
| { |
| "epoch": 0.6899837987229581, |
| "grad_norm": 0.8813055753707886, |
| "learning_rate": 4.648787885249136e-05, |
| "loss": 0.491, |
| "step": 21720 |
| }, |
| { |
| "epoch": 0.6906191429206773, |
| "grad_norm": 1.0838825702667236, |
| "learning_rate": 4.631415721235389e-05, |
| "loss": 0.4732, |
| "step": 21740 |
| }, |
| { |
| "epoch": 0.6912544871183964, |
| "grad_norm": 0.7203667163848877, |
| "learning_rate": 4.614066289970609e-05, |
| "loss": 0.4692, |
| "step": 21760 |
| }, |
| { |
| "epoch": 0.6918898313161155, |
| "grad_norm": 1.181038737297058, |
| "learning_rate": 4.596739664919287e-05, |
| "loss": 0.5177, |
| "step": 21780 |
| }, |
| { |
| "epoch": 0.6925251755138346, |
| "grad_norm": 0.9107904434204102, |
| "learning_rate": 4.579435919449332e-05, |
| "loss": 0.5186, |
| "step": 21800 |
| }, |
| { |
| "epoch": 0.6931605197115537, |
| "grad_norm": 0.8281117081642151, |
| "learning_rate": 4.5621551268317686e-05, |
| "loss": 0.4848, |
| "step": 21820 |
| }, |
| { |
| "epoch": 0.6937958639092728, |
| "grad_norm": 0.9180241227149963, |
| "learning_rate": 4.545759700573378e-05, |
| "loss": 0.4979, |
| "step": 21840 |
| }, |
| { |
| "epoch": 0.694431208106992, |
| "grad_norm": 0.912675678730011, |
| "learning_rate": 4.5285238763954426e-05, |
| "loss": 0.5124, |
| "step": 21860 |
| }, |
| { |
| "epoch": 0.695066552304711, |
| "grad_norm": 0.8163600564002991, |
| "learning_rate": 4.5113112206520056e-05, |
| "loss": 0.5205, |
| "step": 21880 |
| }, |
| { |
| "epoch": 0.6957018965024302, |
| "grad_norm": 0.7308365702629089, |
| "learning_rate": 4.494121806228392e-05, |
| "loss": 0.5208, |
| "step": 21900 |
| }, |
| { |
| "epoch": 0.6963372407001494, |
| "grad_norm": 0.7426006197929382, |
| "learning_rate": 4.476955705911504e-05, |
| "loss": 0.48, |
| "step": 21920 |
| }, |
| { |
| "epoch": 0.6969725848978684, |
| "grad_norm": 0.9886866807937622, |
| "learning_rate": 4.459812992389526e-05, |
| "loss": 0.5483, |
| "step": 21940 |
| }, |
| { |
| "epoch": 0.6976079290955876, |
| "grad_norm": 0.9653937816619873, |
| "learning_rate": 4.44269373825162e-05, |
| "loss": 0.4613, |
| "step": 21960 |
| }, |
| { |
| "epoch": 0.6982432732933066, |
| "grad_norm": 0.8184491991996765, |
| "learning_rate": 4.425598015987602e-05, |
| "loss": 0.5212, |
| "step": 21980 |
| }, |
| { |
| "epoch": 0.6988786174910258, |
| "grad_norm": 0.9365077614784241, |
| "learning_rate": 4.408525897987645e-05, |
| "loss": 0.4868, |
| "step": 22000 |
| }, |
| { |
| "epoch": 0.6988786174910258, |
| "eval_loss": 0.45187339186668396, |
| "eval_runtime": 44.7631, |
| "eval_samples_per_second": 60.385, |
| "eval_steps_per_second": 30.203, |
| "step": 22000 |
| }, |
| { |
| "epoch": 0.6995139616887449, |
| "grad_norm": 0.9188706874847412, |
| "learning_rate": 4.391477456541983e-05, |
| "loss": 0.4991, |
| "step": 22020 |
| }, |
| { |
| "epoch": 0.700149305886464, |
| "grad_norm": 0.8599129319190979, |
| "learning_rate": 4.374452763840584e-05, |
| "loss": 0.5184, |
| "step": 22040 |
| }, |
| { |
| "epoch": 0.7007846500841831, |
| "grad_norm": 0.8643587827682495, |
| "learning_rate": 4.357451891972854e-05, |
| "loss": 0.4966, |
| "step": 22060 |
| }, |
| { |
| "epoch": 0.7014199942819023, |
| "grad_norm": 0.9123074412345886, |
| "learning_rate": 4.340474912927332e-05, |
| "loss": 0.5068, |
| "step": 22080 |
| }, |
| { |
| "epoch": 0.7020553384796213, |
| "grad_norm": 0.8422294855117798, |
| "learning_rate": 4.323521898591394e-05, |
| "loss": 0.4753, |
| "step": 22100 |
| }, |
| { |
| "epoch": 0.7026906826773405, |
| "grad_norm": 0.8830937743186951, |
| "learning_rate": 4.306592920750931e-05, |
| "loss": 0.4837, |
| "step": 22120 |
| }, |
| { |
| "epoch": 0.7033260268750595, |
| "grad_norm": 0.8540763854980469, |
| "learning_rate": 4.289688051090054e-05, |
| "loss": 0.4733, |
| "step": 22140 |
| }, |
| { |
| "epoch": 0.7039613710727787, |
| "grad_norm": 0.8622573614120483, |
| "learning_rate": 4.272807361190797e-05, |
| "loss": 0.5003, |
| "step": 22160 |
| }, |
| { |
| "epoch": 0.7045967152704978, |
| "grad_norm": 0.9827342629432678, |
| "learning_rate": 4.2559509225328e-05, |
| "loss": 0.5333, |
| "step": 22180 |
| }, |
| { |
| "epoch": 0.7052320594682169, |
| "grad_norm": 0.8439646363258362, |
| "learning_rate": 4.239118806493013e-05, |
| "loss": 0.4778, |
| "step": 22200 |
| }, |
| { |
| "epoch": 0.705867403665936, |
| "grad_norm": 0.9348493814468384, |
| "learning_rate": 4.222311084345405e-05, |
| "loss": 0.4806, |
| "step": 22220 |
| }, |
| { |
| "epoch": 0.7065027478636552, |
| "grad_norm": 1.0671905279159546, |
| "learning_rate": 4.2055278272606404e-05, |
| "loss": 0.4978, |
| "step": 22240 |
| }, |
| { |
| "epoch": 0.7071380920613742, |
| "grad_norm": 1.2363934516906738, |
| "learning_rate": 4.188769106305787e-05, |
| "loss": 0.5089, |
| "step": 22260 |
| }, |
| { |
| "epoch": 0.7077734362590934, |
| "grad_norm": 0.9339464902877808, |
| "learning_rate": 4.1720349924440295e-05, |
| "loss": 0.4796, |
| "step": 22280 |
| }, |
| { |
| "epoch": 0.7084087804568124, |
| "grad_norm": 0.873092770576477, |
| "learning_rate": 4.155325556534345e-05, |
| "loss": 0.4931, |
| "step": 22300 |
| }, |
| { |
| "epoch": 0.7090441246545316, |
| "grad_norm": 0.7866622805595398, |
| "learning_rate": 4.138640869331215e-05, |
| "loss": 0.501, |
| "step": 22320 |
| }, |
| { |
| "epoch": 0.7096794688522507, |
| "grad_norm": 1.0133357048034668, |
| "learning_rate": 4.121981001484334e-05, |
| "loss": 0.481, |
| "step": 22340 |
| }, |
| { |
| "epoch": 0.7103148130499698, |
| "grad_norm": 0.9386391043663025, |
| "learning_rate": 4.105346023538292e-05, |
| "loss": 0.5303, |
| "step": 22360 |
| }, |
| { |
| "epoch": 0.710950157247689, |
| "grad_norm": 0.7917353510856628, |
| "learning_rate": 4.088736005932289e-05, |
| "loss": 0.4993, |
| "step": 22380 |
| }, |
| { |
| "epoch": 0.711585501445408, |
| "grad_norm": 0.9757121801376343, |
| "learning_rate": 4.0721510189998266e-05, |
| "loss": 0.5102, |
| "step": 22400 |
| }, |
| { |
| "epoch": 0.7122208456431272, |
| "grad_norm": 1.2196959257125854, |
| "learning_rate": 4.055591132968432e-05, |
| "loss": 0.5045, |
| "step": 22420 |
| }, |
| { |
| "epoch": 0.7128561898408463, |
| "grad_norm": 1.0833863019943237, |
| "learning_rate": 4.039056417959328e-05, |
| "loss": 0.5136, |
| "step": 22440 |
| }, |
| { |
| "epoch": 0.7134915340385654, |
| "grad_norm": 0.7548487186431885, |
| "learning_rate": 4.02254694398716e-05, |
| "loss": 0.4864, |
| "step": 22460 |
| }, |
| { |
| "epoch": 0.7141268782362845, |
| "grad_norm": 1.0435632467269897, |
| "learning_rate": 4.006062780959697e-05, |
| "loss": 0.4866, |
| "step": 22480 |
| }, |
| { |
| "epoch": 0.7147622224340037, |
| "grad_norm": 0.7469571828842163, |
| "learning_rate": 3.9896039986775256e-05, |
| "loss": 0.4825, |
| "step": 22500 |
| }, |
| { |
| "epoch": 0.7153975666317227, |
| "grad_norm": 0.8732174634933472, |
| "learning_rate": 3.9731706668337585e-05, |
| "loss": 0.4905, |
| "step": 22520 |
| }, |
| { |
| "epoch": 0.7160329108294419, |
| "grad_norm": 0.8761599063873291, |
| "learning_rate": 3.956762855013749e-05, |
| "loss": 0.4831, |
| "step": 22540 |
| }, |
| { |
| "epoch": 0.7166682550271609, |
| "grad_norm": 0.9746137261390686, |
| "learning_rate": 3.940380632694781e-05, |
| "loss": 0.5111, |
| "step": 22560 |
| }, |
| { |
| "epoch": 0.7173035992248801, |
| "grad_norm": 0.9219092726707458, |
| "learning_rate": 3.924024069245782e-05, |
| "loss": 0.4908, |
| "step": 22580 |
| }, |
| { |
| "epoch": 0.7179389434225992, |
| "grad_norm": 1.0305086374282837, |
| "learning_rate": 3.907693233927038e-05, |
| "loss": 0.5215, |
| "step": 22600 |
| }, |
| { |
| "epoch": 0.7185742876203183, |
| "grad_norm": 0.7786363363265991, |
| "learning_rate": 3.891388195889882e-05, |
| "loss": 0.4792, |
| "step": 22620 |
| }, |
| { |
| "epoch": 0.7192096318180374, |
| "grad_norm": 0.8930706977844238, |
| "learning_rate": 3.875109024176413e-05, |
| "loss": 0.4908, |
| "step": 22640 |
| }, |
| { |
| "epoch": 0.7198449760157566, |
| "grad_norm": 1.0214048624038696, |
| "learning_rate": 3.858855787719209e-05, |
| "loss": 0.5102, |
| "step": 22660 |
| }, |
| { |
| "epoch": 0.7204803202134756, |
| "grad_norm": 0.9279896020889282, |
| "learning_rate": 3.842628555341018e-05, |
| "loss": 0.4772, |
| "step": 22680 |
| }, |
| { |
| "epoch": 0.7211156644111948, |
| "grad_norm": 1.6357091665267944, |
| "learning_rate": 3.826427395754482e-05, |
| "loss": 0.5041, |
| "step": 22700 |
| }, |
| { |
| "epoch": 0.7217510086089138, |
| "grad_norm": 0.8421345949172974, |
| "learning_rate": 3.8102523775618325e-05, |
| "loss": 0.5082, |
| "step": 22720 |
| }, |
| { |
| "epoch": 0.722386352806633, |
| "grad_norm": 0.9193027019500732, |
| "learning_rate": 3.794103569254624e-05, |
| "loss": 0.485, |
| "step": 22740 |
| }, |
| { |
| "epoch": 0.7230216970043521, |
| "grad_norm": 0.8045080304145813, |
| "learning_rate": 3.777981039213411e-05, |
| "loss": 0.5182, |
| "step": 22760 |
| }, |
| { |
| "epoch": 0.7236570412020712, |
| "grad_norm": 0.8535903692245483, |
| "learning_rate": 3.7618848557074804e-05, |
| "loss": 0.4796, |
| "step": 22780 |
| }, |
| { |
| "epoch": 0.7242923853997904, |
| "grad_norm": 0.8225564360618591, |
| "learning_rate": 3.745815086894565e-05, |
| "loss": 0.4812, |
| "step": 22800 |
| }, |
| { |
| "epoch": 0.7249277295975094, |
| "grad_norm": 0.8030312657356262, |
| "learning_rate": 3.729771800820539e-05, |
| "loss": 0.481, |
| "step": 22820 |
| }, |
| { |
| "epoch": 0.7255630737952286, |
| "grad_norm": 0.992080569267273, |
| "learning_rate": 3.713755065419133e-05, |
| "loss": 0.4768, |
| "step": 22840 |
| }, |
| { |
| "epoch": 0.7261984179929477, |
| "grad_norm": 0.9184660911560059, |
| "learning_rate": 3.698563821122103e-05, |
| "loss": 0.5044, |
| "step": 22860 |
| }, |
| { |
| "epoch": 0.7268337621906668, |
| "grad_norm": 0.8250758647918701, |
| "learning_rate": 3.6825990545007096e-05, |
| "loss": 0.5095, |
| "step": 22880 |
| }, |
| { |
| "epoch": 0.7274691063883859, |
| "grad_norm": 1.0519983768463135, |
| "learning_rate": 3.666661038300353e-05, |
| "loss": 0.4944, |
| "step": 22900 |
| }, |
| { |
| "epoch": 0.7281044505861051, |
| "grad_norm": 0.789730966091156, |
| "learning_rate": 3.650749840009022e-05, |
| "loss": 0.4574, |
| "step": 22920 |
| }, |
| { |
| "epoch": 0.7287397947838241, |
| "grad_norm": 0.8896093368530273, |
| "learning_rate": 3.6356591030872534e-05, |
| "loss": 0.5, |
| "step": 22940 |
| }, |
| { |
| "epoch": 0.7293751389815433, |
| "grad_norm": 0.7810101509094238, |
| "learning_rate": 3.6198003934005195e-05, |
| "loss": 0.5053, |
| "step": 22960 |
| }, |
| { |
| "epoch": 0.7300104831792623, |
| "grad_norm": 0.883144199848175, |
| "learning_rate": 3.603968700049657e-05, |
| "loss": 0.514, |
| "step": 22980 |
| }, |
| { |
| "epoch": 0.7306458273769815, |
| "grad_norm": 0.7069016695022583, |
| "learning_rate": 3.588164090072441e-05, |
| "loss": 0.522, |
| "step": 23000 |
| }, |
| { |
| "epoch": 0.7306458273769815, |
| "eval_loss": 0.4499790668487549, |
| "eval_runtime": 45.0673, |
| "eval_samples_per_second": 59.977, |
| "eval_steps_per_second": 30.0, |
| "step": 23000 |
| }, |
| { |
| "epoch": 0.7312811715747006, |
| "grad_norm": 1.0385907888412476, |
| "learning_rate": 3.5723866303919554e-05, |
| "loss": 0.489, |
| "step": 23020 |
| }, |
| { |
| "epoch": 0.7319165157724197, |
| "grad_norm": 0.8796695470809937, |
| "learning_rate": 3.556636387816317e-05, |
| "loss": 0.4963, |
| "step": 23040 |
| }, |
| { |
| "epoch": 0.7325518599701388, |
| "grad_norm": 0.9427993893623352, |
| "learning_rate": 3.540913429038407e-05, |
| "loss": 0.4601, |
| "step": 23060 |
| }, |
| { |
| "epoch": 0.733187204167858, |
| "grad_norm": 0.8525741100311279, |
| "learning_rate": 3.525217820635564e-05, |
| "loss": 0.5034, |
| "step": 23080 |
| }, |
| { |
| "epoch": 0.733822548365577, |
| "grad_norm": 0.8755898475646973, |
| "learning_rate": 3.5095496290693155e-05, |
| "loss": 0.509, |
| "step": 23100 |
| }, |
| { |
| "epoch": 0.7344578925632962, |
| "grad_norm": 1.0328361988067627, |
| "learning_rate": 3.4939089206851025e-05, |
| "loss": 0.4994, |
| "step": 23120 |
| }, |
| { |
| "epoch": 0.7350932367610152, |
| "grad_norm": 1.130226969718933, |
| "learning_rate": 3.478295761711986e-05, |
| "loss": 0.4848, |
| "step": 23140 |
| }, |
| { |
| "epoch": 0.7357285809587344, |
| "grad_norm": 0.733567476272583, |
| "learning_rate": 3.4627102182623696e-05, |
| "loss": 0.5123, |
| "step": 23160 |
| }, |
| { |
| "epoch": 0.7363639251564535, |
| "grad_norm": 1.1062750816345215, |
| "learning_rate": 3.447152356331721e-05, |
| "loss": 0.4767, |
| "step": 23180 |
| }, |
| { |
| "epoch": 0.7369992693541726, |
| "grad_norm": 0.9558404684066772, |
| "learning_rate": 3.431622241798305e-05, |
| "loss": 0.4832, |
| "step": 23200 |
| }, |
| { |
| "epoch": 0.7376346135518917, |
| "grad_norm": 0.8974496722221375, |
| "learning_rate": 3.416119940422877e-05, |
| "loss": 0.4818, |
| "step": 23220 |
| }, |
| { |
| "epoch": 0.7382699577496109, |
| "grad_norm": 1.2721449136734009, |
| "learning_rate": 3.400645517848427e-05, |
| "loss": 0.5102, |
| "step": 23240 |
| }, |
| { |
| "epoch": 0.73890530194733, |
| "grad_norm": 1.0408607721328735, |
| "learning_rate": 3.385199039599902e-05, |
| "loss": 0.4784, |
| "step": 23260 |
| }, |
| { |
| "epoch": 0.7395406461450491, |
| "grad_norm": 0.9826887845993042, |
| "learning_rate": 3.369780571083909e-05, |
| "loss": 0.5039, |
| "step": 23280 |
| }, |
| { |
| "epoch": 0.7401759903427682, |
| "grad_norm": 0.8110315799713135, |
| "learning_rate": 3.354390177588454e-05, |
| "loss": 0.5034, |
| "step": 23300 |
| }, |
| { |
| "epoch": 0.7408113345404873, |
| "grad_norm": 0.8513306975364685, |
| "learning_rate": 3.339027924282673e-05, |
| "loss": 0.509, |
| "step": 23320 |
| }, |
| { |
| "epoch": 0.7414466787382065, |
| "grad_norm": 0.8255580067634583, |
| "learning_rate": 3.323693876216529e-05, |
| "loss": 0.4678, |
| "step": 23340 |
| }, |
| { |
| "epoch": 0.7420820229359255, |
| "grad_norm": 1.1336640119552612, |
| "learning_rate": 3.30838809832056e-05, |
| "loss": 0.4848, |
| "step": 23360 |
| }, |
| { |
| "epoch": 0.7427173671336447, |
| "grad_norm": 0.8720375895500183, |
| "learning_rate": 3.2931106554056005e-05, |
| "loss": 0.4929, |
| "step": 23380 |
| }, |
| { |
| "epoch": 0.7433527113313637, |
| "grad_norm": 1.0169090032577515, |
| "learning_rate": 3.277861612162498e-05, |
| "loss": 0.5066, |
| "step": 23400 |
| }, |
| { |
| "epoch": 0.7439880555290829, |
| "grad_norm": 1.2800534963607788, |
| "learning_rate": 3.262641033161843e-05, |
| "loss": 0.4964, |
| "step": 23420 |
| }, |
| { |
| "epoch": 0.744623399726802, |
| "grad_norm": 0.819925844669342, |
| "learning_rate": 3.2474489828537046e-05, |
| "loss": 0.509, |
| "step": 23440 |
| }, |
| { |
| "epoch": 0.7452587439245211, |
| "grad_norm": 0.8024299144744873, |
| "learning_rate": 3.232285525567343e-05, |
| "loss": 0.4922, |
| "step": 23460 |
| }, |
| { |
| "epoch": 0.7458940881222402, |
| "grad_norm": 1.1049789190292358, |
| "learning_rate": 3.217150725510946e-05, |
| "loss": 0.4907, |
| "step": 23480 |
| }, |
| { |
| "epoch": 0.7465294323199594, |
| "grad_norm": 1.0818272829055786, |
| "learning_rate": 3.2020446467713516e-05, |
| "loss": 0.4806, |
| "step": 23500 |
| }, |
| { |
| "epoch": 0.7471647765176784, |
| "grad_norm": 0.6681995391845703, |
| "learning_rate": 3.18696735331379e-05, |
| "loss": 0.4504, |
| "step": 23520 |
| }, |
| { |
| "epoch": 0.7478001207153976, |
| "grad_norm": 0.8827902674674988, |
| "learning_rate": 3.171918908981595e-05, |
| "loss": 0.5081, |
| "step": 23540 |
| }, |
| { |
| "epoch": 0.7484354649131166, |
| "grad_norm": 1.0249037742614746, |
| "learning_rate": 3.156899377495938e-05, |
| "loss": 0.5297, |
| "step": 23560 |
| }, |
| { |
| "epoch": 0.7490708091108358, |
| "grad_norm": 1.0797147750854492, |
| "learning_rate": 3.141908822455574e-05, |
| "loss": 0.4701, |
| "step": 23580 |
| }, |
| { |
| "epoch": 0.749706153308555, |
| "grad_norm": 0.724281907081604, |
| "learning_rate": 3.126947307336551e-05, |
| "loss": 0.4608, |
| "step": 23600 |
| }, |
| { |
| "epoch": 0.750341497506274, |
| "grad_norm": 0.7410632967948914, |
| "learning_rate": 3.1120148954919485e-05, |
| "loss": 0.4747, |
| "step": 23620 |
| }, |
| { |
| "epoch": 0.7509768417039931, |
| "grad_norm": 1.0309559106826782, |
| "learning_rate": 3.09711165015162e-05, |
| "loss": 0.534, |
| "step": 23640 |
| }, |
| { |
| "epoch": 0.7516121859017123, |
| "grad_norm": 0.9060602784156799, |
| "learning_rate": 3.0822376344219105e-05, |
| "loss": 0.4709, |
| "step": 23660 |
| }, |
| { |
| "epoch": 0.7522475300994313, |
| "grad_norm": 0.9018211364746094, |
| "learning_rate": 3.067392911285395e-05, |
| "loss": 0.5084, |
| "step": 23680 |
| }, |
| { |
| "epoch": 0.7528828742971505, |
| "grad_norm": 1.1375420093536377, |
| "learning_rate": 3.0525775436006107e-05, |
| "loss": 0.5023, |
| "step": 23700 |
| }, |
| { |
| "epoch": 0.7535182184948696, |
| "grad_norm": 0.8034165501594543, |
| "learning_rate": 3.0377915941017955e-05, |
| "loss": 0.4947, |
| "step": 23720 |
| }, |
| { |
| "epoch": 0.7541535626925887, |
| "grad_norm": 1.0958040952682495, |
| "learning_rate": 3.0230351253986143e-05, |
| "loss": 0.5009, |
| "step": 23740 |
| }, |
| { |
| "epoch": 0.7547889068903079, |
| "grad_norm": 0.8740959763526917, |
| "learning_rate": 3.0083081999759067e-05, |
| "loss": 0.4942, |
| "step": 23760 |
| }, |
| { |
| "epoch": 0.7554242510880269, |
| "grad_norm": 0.8798695206642151, |
| "learning_rate": 2.993610880193406e-05, |
| "loss": 0.4676, |
| "step": 23780 |
| }, |
| { |
| "epoch": 0.7560595952857461, |
| "grad_norm": 0.9538172483444214, |
| "learning_rate": 2.9789432282854822e-05, |
| "loss": 0.4441, |
| "step": 23800 |
| }, |
| { |
| "epoch": 0.7566949394834651, |
| "grad_norm": 0.9560829401016235, |
| "learning_rate": 2.9643053063608917e-05, |
| "loss": 0.4995, |
| "step": 23820 |
| }, |
| { |
| "epoch": 0.7573302836811843, |
| "grad_norm": 1.0306763648986816, |
| "learning_rate": 2.9496971764024884e-05, |
| "loss": 0.5042, |
| "step": 23840 |
| }, |
| { |
| "epoch": 0.7579656278789034, |
| "grad_norm": 0.9823128581047058, |
| "learning_rate": 2.9351189002669788e-05, |
| "loss": 0.5274, |
| "step": 23860 |
| }, |
| { |
| "epoch": 0.7586009720766225, |
| "grad_norm": 0.8448672890663147, |
| "learning_rate": 2.920570539684665e-05, |
| "loss": 0.4713, |
| "step": 23880 |
| }, |
| { |
| "epoch": 0.7592363162743416, |
| "grad_norm": 0.8830504417419434, |
| "learning_rate": 2.9060521562591624e-05, |
| "loss": 0.5069, |
| "step": 23900 |
| }, |
| { |
| "epoch": 0.7598716604720608, |
| "grad_norm": 0.9051734805107117, |
| "learning_rate": 2.891563811467154e-05, |
| "loss": 0.48, |
| "step": 23920 |
| }, |
| { |
| "epoch": 0.7605070046697798, |
| "grad_norm": 0.8309674859046936, |
| "learning_rate": 2.877105566658136e-05, |
| "loss": 0.5141, |
| "step": 23940 |
| }, |
| { |
| "epoch": 0.761142348867499, |
| "grad_norm": 0.8684896230697632, |
| "learning_rate": 2.863398169962057e-05, |
| "loss": 0.4518, |
| "step": 23960 |
| }, |
| { |
| "epoch": 0.761777693065218, |
| "grad_norm": 0.959536075592041, |
| "learning_rate": 2.8489987960934184e-05, |
| "loss": 0.483, |
| "step": 23980 |
| }, |
| { |
| "epoch": 0.7624130372629372, |
| "grad_norm": 1.3519070148468018, |
| "learning_rate": 2.8353474370325594e-05, |
| "loss": 0.5062, |
| "step": 24000 |
| }, |
| { |
| "epoch": 0.7624130372629372, |
| "eval_loss": 0.4479082524776459, |
| "eval_runtime": 44.6533, |
| "eval_samples_per_second": 60.533, |
| "eval_steps_per_second": 30.278, |
| "step": 24000 |
| }, |
| { |
| "epoch": 0.7630483814606563, |
| "grad_norm": 0.8832095861434937, |
| "learning_rate": 2.8210071659529526e-05, |
| "loss": 0.5204, |
| "step": 24020 |
| }, |
| { |
| "epoch": 0.7636837256583754, |
| "grad_norm": 0.793205738067627, |
| "learning_rate": 2.8066972936216017e-05, |
| "loss": 0.5037, |
| "step": 24040 |
| }, |
| { |
| "epoch": 0.7643190698560945, |
| "grad_norm": 0.8483644127845764, |
| "learning_rate": 2.79241788063227e-05, |
| "loss": 0.4812, |
| "step": 24060 |
| }, |
| { |
| "epoch": 0.7649544140538137, |
| "grad_norm": 1.50220787525177, |
| "learning_rate": 2.7781689874497406e-05, |
| "loss": 0.501, |
| "step": 24080 |
| }, |
| { |
| "epoch": 0.7655897582515327, |
| "grad_norm": 0.8091638684272766, |
| "learning_rate": 2.7639506744095766e-05, |
| "loss": 0.4932, |
| "step": 24100 |
| }, |
| { |
| "epoch": 0.7662251024492519, |
| "grad_norm": 0.9171321392059326, |
| "learning_rate": 2.74976300171784e-05, |
| "loss": 0.5, |
| "step": 24120 |
| }, |
| { |
| "epoch": 0.766860446646971, |
| "grad_norm": 0.9392116069793701, |
| "learning_rate": 2.7356060294508502e-05, |
| "loss": 0.5075, |
| "step": 24140 |
| }, |
| { |
| "epoch": 0.7674957908446901, |
| "grad_norm": 0.9384047389030457, |
| "learning_rate": 2.7214798175549395e-05, |
| "loss": 0.4893, |
| "step": 24160 |
| }, |
| { |
| "epoch": 0.7681311350424093, |
| "grad_norm": 0.7760775685310364, |
| "learning_rate": 2.707384425846178e-05, |
| "loss": 0.5267, |
| "step": 24180 |
| }, |
| { |
| "epoch": 0.7687664792401283, |
| "grad_norm": 0.8666489720344543, |
| "learning_rate": 2.6933199140101285e-05, |
| "loss": 0.5201, |
| "step": 24200 |
| }, |
| { |
| "epoch": 0.7694018234378475, |
| "grad_norm": 0.9711599946022034, |
| "learning_rate": 2.679286341601609e-05, |
| "loss": 0.4923, |
| "step": 24220 |
| }, |
| { |
| "epoch": 0.7700371676355666, |
| "grad_norm": 0.9399335980415344, |
| "learning_rate": 2.6652837680444153e-05, |
| "loss": 0.5281, |
| "step": 24240 |
| }, |
| { |
| "epoch": 0.7706725118332857, |
| "grad_norm": 0.8116670250892639, |
| "learning_rate": 2.651312252631083e-05, |
| "loss": 0.5111, |
| "step": 24260 |
| }, |
| { |
| "epoch": 0.7713078560310048, |
| "grad_norm": 0.873943030834198, |
| "learning_rate": 2.6373718545226445e-05, |
| "loss": 0.471, |
| "step": 24280 |
| }, |
| { |
| "epoch": 0.7719432002287239, |
| "grad_norm": 0.9560205340385437, |
| "learning_rate": 2.623462632748359e-05, |
| "loss": 0.5101, |
| "step": 24300 |
| }, |
| { |
| "epoch": 0.772578544426443, |
| "grad_norm": 1.011898159980774, |
| "learning_rate": 2.6095846462054763e-05, |
| "loss": 0.4906, |
| "step": 24320 |
| }, |
| { |
| "epoch": 0.7732138886241622, |
| "grad_norm": 1.0334892272949219, |
| "learning_rate": 2.595737953658982e-05, |
| "loss": 0.4905, |
| "step": 24340 |
| }, |
| { |
| "epoch": 0.7738492328218812, |
| "grad_norm": 0.6994766592979431, |
| "learning_rate": 2.581922613741352e-05, |
| "loss": 0.4794, |
| "step": 24360 |
| }, |
| { |
| "epoch": 0.7744845770196004, |
| "grad_norm": 0.9781257510185242, |
| "learning_rate": 2.5681386849523003e-05, |
| "loss": 0.4871, |
| "step": 24380 |
| }, |
| { |
| "epoch": 0.7751199212173194, |
| "grad_norm": 1.0443729162216187, |
| "learning_rate": 2.5543862256585393e-05, |
| "loss": 0.5133, |
| "step": 24400 |
| }, |
| { |
| "epoch": 0.7757552654150386, |
| "grad_norm": 0.8841618299484253, |
| "learning_rate": 2.5406652940935217e-05, |
| "loss": 0.4865, |
| "step": 24420 |
| }, |
| { |
| "epoch": 0.7763906096127577, |
| "grad_norm": 0.8439558148384094, |
| "learning_rate": 2.5269759483571954e-05, |
| "loss": 0.4908, |
| "step": 24440 |
| }, |
| { |
| "epoch": 0.7770259538104768, |
| "grad_norm": 0.9146759510040283, |
| "learning_rate": 2.5133182464157734e-05, |
| "loss": 0.4934, |
| "step": 24460 |
| }, |
| { |
| "epoch": 0.777661298008196, |
| "grad_norm": 0.7785593867301941, |
| "learning_rate": 2.499692246101466e-05, |
| "loss": 0.4857, |
| "step": 24480 |
| }, |
| { |
| "epoch": 0.7782966422059151, |
| "grad_norm": 0.9240188002586365, |
| "learning_rate": 2.4860980051122474e-05, |
| "loss": 0.4958, |
| "step": 24500 |
| }, |
| { |
| "epoch": 0.7789319864036341, |
| "grad_norm": 1.0593191385269165, |
| "learning_rate": 2.4725355810116103e-05, |
| "loss": 0.5077, |
| "step": 24520 |
| }, |
| { |
| "epoch": 0.7795673306013533, |
| "grad_norm": 0.8705240488052368, |
| "learning_rate": 2.4590050312283263e-05, |
| "loss": 0.4792, |
| "step": 24540 |
| }, |
| { |
| "epoch": 0.7802026747990723, |
| "grad_norm": 0.8610863089561462, |
| "learning_rate": 2.4455064130561944e-05, |
| "loss": 0.4949, |
| "step": 24560 |
| }, |
| { |
| "epoch": 0.7808380189967915, |
| "grad_norm": 1.152521014213562, |
| "learning_rate": 2.432039783653799e-05, |
| "loss": 0.5076, |
| "step": 24580 |
| }, |
| { |
| "epoch": 0.7814733631945107, |
| "grad_norm": 0.8608033657073975, |
| "learning_rate": 2.4186052000442806e-05, |
| "loss": 0.4759, |
| "step": 24600 |
| }, |
| { |
| "epoch": 0.7821087073922297, |
| "grad_norm": 1.1664726734161377, |
| "learning_rate": 2.4052027191150762e-05, |
| "loss": 0.4941, |
| "step": 24620 |
| }, |
| { |
| "epoch": 0.7827440515899489, |
| "grad_norm": 0.8805221915245056, |
| "learning_rate": 2.3918323976176883e-05, |
| "loss": 0.4797, |
| "step": 24640 |
| }, |
| { |
| "epoch": 0.783379395787668, |
| "grad_norm": 0.7699743509292603, |
| "learning_rate": 2.3784942921674512e-05, |
| "loss": 0.4903, |
| "step": 24660 |
| }, |
| { |
| "epoch": 0.7840147399853871, |
| "grad_norm": 0.9498074650764465, |
| "learning_rate": 2.365188459243274e-05, |
| "loss": 0.4679, |
| "step": 24680 |
| }, |
| { |
| "epoch": 0.7846500841831062, |
| "grad_norm": 0.815447986125946, |
| "learning_rate": 2.351914955187412e-05, |
| "loss": 0.5114, |
| "step": 24700 |
| }, |
| { |
| "epoch": 0.7852854283808253, |
| "grad_norm": 0.984866738319397, |
| "learning_rate": 2.3386738362052353e-05, |
| "loss": 0.4725, |
| "step": 24720 |
| }, |
| { |
| "epoch": 0.7859207725785444, |
| "grad_norm": 1.0802818536758423, |
| "learning_rate": 2.3254651583649735e-05, |
| "loss": 0.4684, |
| "step": 24740 |
| }, |
| { |
| "epoch": 0.7865561167762636, |
| "grad_norm": 0.8058573007583618, |
| "learning_rate": 2.3122889775974887e-05, |
| "loss": 0.4847, |
| "step": 24760 |
| }, |
| { |
| "epoch": 0.7871914609739826, |
| "grad_norm": 0.8836669921875, |
| "learning_rate": 2.2991453496960447e-05, |
| "loss": 0.4859, |
| "step": 24780 |
| }, |
| { |
| "epoch": 0.7878268051717018, |
| "grad_norm": 0.7214009165763855, |
| "learning_rate": 2.2860343303160535e-05, |
| "loss": 0.4816, |
| "step": 24800 |
| }, |
| { |
| "epoch": 0.7884621493694208, |
| "grad_norm": 0.8268193006515503, |
| "learning_rate": 2.2729559749748575e-05, |
| "loss": 0.4674, |
| "step": 24820 |
| }, |
| { |
| "epoch": 0.78909749356714, |
| "grad_norm": 0.7158612608909607, |
| "learning_rate": 2.2599103390514766e-05, |
| "loss": 0.465, |
| "step": 24840 |
| }, |
| { |
| "epoch": 0.7897328377648591, |
| "grad_norm": 0.8904339671134949, |
| "learning_rate": 2.246897477786396e-05, |
| "loss": 0.5024, |
| "step": 24860 |
| }, |
| { |
| "epoch": 0.7903681819625782, |
| "grad_norm": 0.8315703272819519, |
| "learning_rate": 2.2339174462813127e-05, |
| "loss": 0.4609, |
| "step": 24880 |
| }, |
| { |
| "epoch": 0.7910035261602973, |
| "grad_norm": 0.8962224721908569, |
| "learning_rate": 2.2209702994989045e-05, |
| "loss": 0.4906, |
| "step": 24900 |
| }, |
| { |
| "epoch": 0.7916388703580165, |
| "grad_norm": 0.9301977753639221, |
| "learning_rate": 2.208056092262616e-05, |
| "loss": 0.5216, |
| "step": 24920 |
| }, |
| { |
| "epoch": 0.7922742145557355, |
| "grad_norm": 0.8634437918663025, |
| "learning_rate": 2.1951748792563985e-05, |
| "loss": 0.5031, |
| "step": 24940 |
| }, |
| { |
| "epoch": 0.7929095587534547, |
| "grad_norm": 0.8985020518302917, |
| "learning_rate": 2.1823267150244964e-05, |
| "loss": 0.4709, |
| "step": 24960 |
| }, |
| { |
| "epoch": 0.7935449029511737, |
| "grad_norm": 1.1470792293548584, |
| "learning_rate": 2.16951165397122e-05, |
| "loss": 0.5224, |
| "step": 24980 |
| }, |
| { |
| "epoch": 0.7941802471488929, |
| "grad_norm": 0.919326663017273, |
| "learning_rate": 2.1567297503606987e-05, |
| "loss": 0.5004, |
| "step": 25000 |
| }, |
| { |
| "epoch": 0.7941802471488929, |
| "eval_loss": 0.44602036476135254, |
| "eval_runtime": 44.8391, |
| "eval_samples_per_second": 60.282, |
| "eval_steps_per_second": 30.152, |
| "step": 25000 |
| }, |
| { |
| "epoch": 0.7948155913466121, |
| "grad_norm": 1.1010879278182983, |
| "learning_rate": 2.1439810583166587e-05, |
| "loss": 0.5077, |
| "step": 25020 |
| }, |
| { |
| "epoch": 0.7954509355443311, |
| "grad_norm": 0.8573036789894104, |
| "learning_rate": 2.131900612258364e-05, |
| "loss": 0.4973, |
| "step": 25040 |
| }, |
| { |
| "epoch": 0.7960862797420503, |
| "grad_norm": 0.8931069374084473, |
| "learning_rate": 2.1198502345256165e-05, |
| "loss": 0.4972, |
| "step": 25060 |
| }, |
| { |
| "epoch": 0.7967216239397694, |
| "grad_norm": 1.239161491394043, |
| "learning_rate": 2.107198160794136e-05, |
| "loss": 0.4981, |
| "step": 25080 |
| }, |
| { |
| "epoch": 0.7973569681374885, |
| "grad_norm": 0.9950107336044312, |
| "learning_rate": 2.0945795083658447e-05, |
| "loss": 0.506, |
| "step": 25100 |
| }, |
| { |
| "epoch": 0.7979923123352076, |
| "grad_norm": 0.7783673405647278, |
| "learning_rate": 2.0819943306732082e-05, |
| "loss": 0.4763, |
| "step": 25120 |
| }, |
| { |
| "epoch": 0.7986276565329267, |
| "grad_norm": 0.912331223487854, |
| "learning_rate": 2.0694426810069345e-05, |
| "loss": 0.4622, |
| "step": 25140 |
| }, |
| { |
| "epoch": 0.7992630007306458, |
| "grad_norm": 0.8284201622009277, |
| "learning_rate": 2.0569246125157658e-05, |
| "loss": 0.513, |
| "step": 25160 |
| }, |
| { |
| "epoch": 0.799898344928365, |
| "grad_norm": 1.1468638181686401, |
| "learning_rate": 2.0444401782062518e-05, |
| "loss": 0.4719, |
| "step": 25180 |
| }, |
| { |
| "epoch": 0.800533689126084, |
| "grad_norm": 1.0985773801803589, |
| "learning_rate": 2.0319894309425146e-05, |
| "loss": 0.4871, |
| "step": 25200 |
| }, |
| { |
| "epoch": 0.8011690333238032, |
| "grad_norm": 1.1010768413543701, |
| "learning_rate": 2.0195724234460322e-05, |
| "loss": 0.5459, |
| "step": 25220 |
| }, |
| { |
| "epoch": 0.8018043775215223, |
| "grad_norm": 0.9938257336616516, |
| "learning_rate": 2.0071892082954248e-05, |
| "loss": 0.5127, |
| "step": 25240 |
| }, |
| { |
| "epoch": 0.8024397217192414, |
| "grad_norm": 1.1338539123535156, |
| "learning_rate": 1.9954565018232684e-05, |
| "loss": 0.4838, |
| "step": 25260 |
| }, |
| { |
| "epoch": 0.8030750659169605, |
| "grad_norm": 0.7955858111381531, |
| "learning_rate": 1.9831393324342518e-05, |
| "loss": 0.4865, |
| "step": 25280 |
| }, |
| { |
| "epoch": 0.8037104101146796, |
| "grad_norm": 1.0443702936172485, |
| "learning_rate": 1.9708561096634902e-05, |
| "loss": 0.4749, |
| "step": 25300 |
| }, |
| { |
| "epoch": 0.8043457543123987, |
| "grad_norm": 1.0816038846969604, |
| "learning_rate": 1.958606885523103e-05, |
| "loss": 0.5142, |
| "step": 25320 |
| }, |
| { |
| "epoch": 0.8049810985101179, |
| "grad_norm": 1.2127019166946411, |
| "learning_rate": 1.946391711881239e-05, |
| "loss": 0.4831, |
| "step": 25340 |
| }, |
| { |
| "epoch": 0.8056164427078369, |
| "grad_norm": 0.8780348300933838, |
| "learning_rate": 1.9342106404618632e-05, |
| "loss": 0.5113, |
| "step": 25360 |
| }, |
| { |
| "epoch": 0.8062517869055561, |
| "grad_norm": 0.7795581221580505, |
| "learning_rate": 1.9220637228445438e-05, |
| "loss": 0.4721, |
| "step": 25380 |
| }, |
| { |
| "epoch": 0.8068871311032751, |
| "grad_norm": 0.9518604874610901, |
| "learning_rate": 1.9099510104642216e-05, |
| "loss": 0.4754, |
| "step": 25400 |
| }, |
| { |
| "epoch": 0.8075224753009943, |
| "grad_norm": 1.0051589012145996, |
| "learning_rate": 1.8978725546110022e-05, |
| "loss": 0.4936, |
| "step": 25420 |
| }, |
| { |
| "epoch": 0.8081578194987135, |
| "grad_norm": 0.8047780394554138, |
| "learning_rate": 1.8858284064299326e-05, |
| "loss": 0.4901, |
| "step": 25440 |
| }, |
| { |
| "epoch": 0.8087931636964325, |
| "grad_norm": 1.1246352195739746, |
| "learning_rate": 1.8738186169207917e-05, |
| "loss": 0.5117, |
| "step": 25460 |
| }, |
| { |
| "epoch": 0.8094285078941517, |
| "grad_norm": 0.8150719404220581, |
| "learning_rate": 1.861843236937867e-05, |
| "loss": 0.4685, |
| "step": 25480 |
| }, |
| { |
| "epoch": 0.8100638520918708, |
| "grad_norm": 2.195882558822632, |
| "learning_rate": 1.8499023171897388e-05, |
| "loss": 0.471, |
| "step": 25500 |
| }, |
| { |
| "epoch": 0.8106991962895899, |
| "grad_norm": 0.8962704539299011, |
| "learning_rate": 1.8379959082390798e-05, |
| "loss": 0.481, |
| "step": 25520 |
| }, |
| { |
| "epoch": 0.811334540487309, |
| "grad_norm": 0.8531712889671326, |
| "learning_rate": 1.8261240605024165e-05, |
| "loss": 0.4881, |
| "step": 25540 |
| }, |
| { |
| "epoch": 0.8119698846850281, |
| "grad_norm": 0.9354826807975769, |
| "learning_rate": 1.8142868242499368e-05, |
| "loss": 0.4761, |
| "step": 25560 |
| }, |
| { |
| "epoch": 0.8126052288827472, |
| "grad_norm": 1.0048118829727173, |
| "learning_rate": 1.8024842496052708e-05, |
| "loss": 0.4968, |
| "step": 25580 |
| }, |
| { |
| "epoch": 0.8132405730804664, |
| "grad_norm": 0.8254916071891785, |
| "learning_rate": 1.790716386545275e-05, |
| "loss": 0.5076, |
| "step": 25600 |
| }, |
| { |
| "epoch": 0.8138759172781854, |
| "grad_norm": 0.9708372950553894, |
| "learning_rate": 1.778983284899819e-05, |
| "loss": 0.5197, |
| "step": 25620 |
| }, |
| { |
| "epoch": 0.8145112614759046, |
| "grad_norm": 0.9034101366996765, |
| "learning_rate": 1.767284994351588e-05, |
| "loss": 0.4954, |
| "step": 25640 |
| }, |
| { |
| "epoch": 0.8151466056736237, |
| "grad_norm": 1.3567668199539185, |
| "learning_rate": 1.7556215644358564e-05, |
| "loss": 0.5133, |
| "step": 25660 |
| }, |
| { |
| "epoch": 0.8157819498713428, |
| "grad_norm": 0.9000421166419983, |
| "learning_rate": 1.743993044540282e-05, |
| "loss": 0.524, |
| "step": 25680 |
| }, |
| { |
| "epoch": 0.8164172940690619, |
| "grad_norm": 0.7230278849601746, |
| "learning_rate": 1.7323994839047086e-05, |
| "loss": 0.4831, |
| "step": 25700 |
| }, |
| { |
| "epoch": 0.817052638266781, |
| "grad_norm": 0.8648797273635864, |
| "learning_rate": 1.7208409316209407e-05, |
| "loss": 0.4932, |
| "step": 25720 |
| }, |
| { |
| "epoch": 0.8176879824645001, |
| "grad_norm": 0.9017996788024902, |
| "learning_rate": 1.709317436632547e-05, |
| "loss": 0.4787, |
| "step": 25740 |
| }, |
| { |
| "epoch": 0.8183233266622193, |
| "grad_norm": 0.9122520685195923, |
| "learning_rate": 1.697829047734646e-05, |
| "loss": 0.4721, |
| "step": 25760 |
| }, |
| { |
| "epoch": 0.8189586708599383, |
| "grad_norm": 0.9448441863059998, |
| "learning_rate": 1.6863758135737085e-05, |
| "loss": 0.4772, |
| "step": 25780 |
| }, |
| { |
| "epoch": 0.8195940150576575, |
| "grad_norm": 1.052437424659729, |
| "learning_rate": 1.6749577826473405e-05, |
| "loss": 0.5252, |
| "step": 25800 |
| }, |
| { |
| "epoch": 0.8202293592553767, |
| "grad_norm": 0.9826536774635315, |
| "learning_rate": 1.6635750033040842e-05, |
| "loss": 0.5187, |
| "step": 25820 |
| }, |
| { |
| "epoch": 0.8208647034530957, |
| "grad_norm": 0.8498765826225281, |
| "learning_rate": 1.6522275237432193e-05, |
| "loss": 0.4792, |
| "step": 25840 |
| }, |
| { |
| "epoch": 0.8215000476508149, |
| "grad_norm": 0.9139013886451721, |
| "learning_rate": 1.6409153920145416e-05, |
| "loss": 0.5006, |
| "step": 25860 |
| }, |
| { |
| "epoch": 0.8221353918485339, |
| "grad_norm": 0.9082590937614441, |
| "learning_rate": 1.6296386560181744e-05, |
| "loss": 0.4801, |
| "step": 25880 |
| }, |
| { |
| "epoch": 0.8227707360462531, |
| "grad_norm": 0.8360690474510193, |
| "learning_rate": 1.618397363504366e-05, |
| "loss": 0.491, |
| "step": 25900 |
| }, |
| { |
| "epoch": 0.8234060802439722, |
| "grad_norm": 0.8585413098335266, |
| "learning_rate": 1.6071915620732746e-05, |
| "loss": 0.4952, |
| "step": 25920 |
| }, |
| { |
| "epoch": 0.8240414244416913, |
| "grad_norm": 0.9051182866096497, |
| "learning_rate": 1.5960212991747804e-05, |
| "loss": 0.5021, |
| "step": 25940 |
| }, |
| { |
| "epoch": 0.8246767686394104, |
| "grad_norm": 1.1850552558898926, |
| "learning_rate": 1.584886622108276e-05, |
| "loss": 0.5194, |
| "step": 25960 |
| }, |
| { |
| "epoch": 0.8253121128371295, |
| "grad_norm": 0.8449670672416687, |
| "learning_rate": 1.57378757802247e-05, |
| "loss": 0.4988, |
| "step": 25980 |
| }, |
| { |
| "epoch": 0.8259474570348486, |
| "grad_norm": 0.9663527607917786, |
| "learning_rate": 1.5627242139151867e-05, |
| "loss": 0.4782, |
| "step": 26000 |
| }, |
| { |
| "epoch": 0.8259474570348486, |
| "eval_loss": 0.44560423493385315, |
| "eval_runtime": 45.0247, |
| "eval_samples_per_second": 60.034, |
| "eval_steps_per_second": 30.028, |
| "step": 26000 |
| }, |
| { |
| "epoch": 0.8265828012325678, |
| "grad_norm": 1.0954176187515259, |
| "learning_rate": 1.5516965766331715e-05, |
| "loss": 0.4992, |
| "step": 26020 |
| }, |
| { |
| "epoch": 0.8272181454302868, |
| "grad_norm": 0.9752370119094849, |
| "learning_rate": 1.540704712871881e-05, |
| "loss": 0.5109, |
| "step": 26040 |
| }, |
| { |
| "epoch": 0.827853489628006, |
| "grad_norm": 0.7089188098907471, |
| "learning_rate": 1.5297486691752928e-05, |
| "loss": 0.4669, |
| "step": 26060 |
| }, |
| { |
| "epoch": 0.8284888338257251, |
| "grad_norm": 0.8641648292541504, |
| "learning_rate": 1.5188284919357155e-05, |
| "loss": 0.4905, |
| "step": 26080 |
| }, |
| { |
| "epoch": 0.8291241780234442, |
| "grad_norm": 0.8167259097099304, |
| "learning_rate": 1.5079442273935773e-05, |
| "loss": 0.4776, |
| "step": 26100 |
| }, |
| { |
| "epoch": 0.8297595222211633, |
| "grad_norm": 0.9287614226341248, |
| "learning_rate": 1.4970959216372372e-05, |
| "loss": 0.4803, |
| "step": 26120 |
| }, |
| { |
| "epoch": 0.8303948664188824, |
| "grad_norm": 0.8652564883232117, |
| "learning_rate": 1.4862836206027975e-05, |
| "loss": 0.4623, |
| "step": 26140 |
| }, |
| { |
| "epoch": 0.8310302106166015, |
| "grad_norm": 0.9141151309013367, |
| "learning_rate": 1.4755073700738953e-05, |
| "loss": 0.507, |
| "step": 26160 |
| }, |
| { |
| "epoch": 0.8316655548143207, |
| "grad_norm": 0.9454159736633301, |
| "learning_rate": 1.464767215681515e-05, |
| "loss": 0.5218, |
| "step": 26180 |
| }, |
| { |
| "epoch": 0.8323008990120397, |
| "grad_norm": 0.7766212821006775, |
| "learning_rate": 1.4540632029038026e-05, |
| "loss": 0.5294, |
| "step": 26200 |
| }, |
| { |
| "epoch": 0.8329362432097589, |
| "grad_norm": 0.8662501573562622, |
| "learning_rate": 1.443395377065858e-05, |
| "loss": 0.4931, |
| "step": 26220 |
| }, |
| { |
| "epoch": 0.833571587407478, |
| "grad_norm": 1.0195443630218506, |
| "learning_rate": 1.4327637833395525e-05, |
| "loss": 0.5165, |
| "step": 26240 |
| }, |
| { |
| "epoch": 0.8342069316051971, |
| "grad_norm": 0.9022318124771118, |
| "learning_rate": 1.422168466743341e-05, |
| "loss": 0.4732, |
| "step": 26260 |
| }, |
| { |
| "epoch": 0.8348422758029163, |
| "grad_norm": 0.9162563681602478, |
| "learning_rate": 1.4116094721420625e-05, |
| "loss": 0.496, |
| "step": 26280 |
| }, |
| { |
| "epoch": 0.8354776200006353, |
| "grad_norm": 1.129158854484558, |
| "learning_rate": 1.401086844246755e-05, |
| "loss": 0.4764, |
| "step": 26300 |
| }, |
| { |
| "epoch": 0.8361129641983545, |
| "grad_norm": 0.8695496320724487, |
| "learning_rate": 1.3906006276144601e-05, |
| "loss": 0.4852, |
| "step": 26320 |
| }, |
| { |
| "epoch": 0.8367483083960736, |
| "grad_norm": 1.7362381219863892, |
| "learning_rate": 1.3801508666480512e-05, |
| "loss": 0.4642, |
| "step": 26340 |
| }, |
| { |
| "epoch": 0.8373836525937927, |
| "grad_norm": 0.7645226716995239, |
| "learning_rate": 1.369737605596022e-05, |
| "loss": 0.503, |
| "step": 26360 |
| }, |
| { |
| "epoch": 0.8380189967915118, |
| "grad_norm": 0.8403562903404236, |
| "learning_rate": 1.3593608885523158e-05, |
| "loss": 0.4766, |
| "step": 26380 |
| }, |
| { |
| "epoch": 0.8386543409892309, |
| "grad_norm": 0.7841979265213013, |
| "learning_rate": 1.3490207594561366e-05, |
| "loss": 0.4917, |
| "step": 26400 |
| }, |
| { |
| "epoch": 0.83928968518695, |
| "grad_norm": 0.8631531000137329, |
| "learning_rate": 1.3392315662821897e-05, |
| "loss": 0.4972, |
| "step": 26420 |
| }, |
| { |
| "epoch": 0.8399250293846692, |
| "grad_norm": 1.0436699390411377, |
| "learning_rate": 1.3289629094769217e-05, |
| "loss": 0.4847, |
| "step": 26440 |
| }, |
| { |
| "epoch": 0.8405603735823882, |
| "grad_norm": 0.9521028399467468, |
| "learning_rate": 1.318730969336468e-05, |
| "loss": 0.4972, |
| "step": 26460 |
| }, |
| { |
| "epoch": 0.8411957177801074, |
| "grad_norm": 0.9861098527908325, |
| "learning_rate": 1.3085357891869909e-05, |
| "loss": 0.5114, |
| "step": 26480 |
| }, |
| { |
| "epoch": 0.8418310619778265, |
| "grad_norm": 1.3008265495300293, |
| "learning_rate": 1.2983774121989888e-05, |
| "loss": 0.5071, |
| "step": 26500 |
| }, |
| { |
| "epoch": 0.8424664061755456, |
| "grad_norm": 0.7970487475395203, |
| "learning_rate": 1.2882558813871204e-05, |
| "loss": 0.4945, |
| "step": 26520 |
| }, |
| { |
| "epoch": 0.8431017503732647, |
| "grad_norm": 0.7304345369338989, |
| "learning_rate": 1.2781712396100287e-05, |
| "loss": 0.4902, |
| "step": 26540 |
| }, |
| { |
| "epoch": 0.8437370945709838, |
| "grad_norm": 0.9716693162918091, |
| "learning_rate": 1.2681235295701488e-05, |
| "loss": 0.4857, |
| "step": 26560 |
| }, |
| { |
| "epoch": 0.8443724387687029, |
| "grad_norm": 0.9461120963096619, |
| "learning_rate": 1.2581127938135328e-05, |
| "loss": 0.5139, |
| "step": 26580 |
| }, |
| { |
| "epoch": 0.8450077829664221, |
| "grad_norm": 0.8130011558532715, |
| "learning_rate": 1.2481390747296717e-05, |
| "loss": 0.4788, |
| "step": 26600 |
| }, |
| { |
| "epoch": 0.8456431271641411, |
| "grad_norm": 0.959818959236145, |
| "learning_rate": 1.2382024145513094e-05, |
| "loss": 0.4808, |
| "step": 26620 |
| }, |
| { |
| "epoch": 0.8462784713618603, |
| "grad_norm": 1.2069573402404785, |
| "learning_rate": 1.2283028553542674e-05, |
| "loss": 0.4692, |
| "step": 26640 |
| }, |
| { |
| "epoch": 0.8469138155595795, |
| "grad_norm": 1.0251085758209229, |
| "learning_rate": 1.2184404390572712e-05, |
| "loss": 0.5106, |
| "step": 26660 |
| }, |
| { |
| "epoch": 0.8475491597572985, |
| "grad_norm": 0.9423872828483582, |
| "learning_rate": 1.2086152074217638e-05, |
| "loss": 0.4881, |
| "step": 26680 |
| }, |
| { |
| "epoch": 0.8481845039550177, |
| "grad_norm": 0.8245638608932495, |
| "learning_rate": 1.1988272020517322e-05, |
| "loss": 0.4606, |
| "step": 26700 |
| }, |
| { |
| "epoch": 0.8488198481527367, |
| "grad_norm": 1.0099587440490723, |
| "learning_rate": 1.1890764643935393e-05, |
| "loss": 0.4976, |
| "step": 26720 |
| }, |
| { |
| "epoch": 0.8494551923504559, |
| "grad_norm": 0.8285634517669678, |
| "learning_rate": 1.1793630357357355e-05, |
| "loss": 0.5057, |
| "step": 26740 |
| }, |
| { |
| "epoch": 0.850090536548175, |
| "grad_norm": 0.9125322699546814, |
| "learning_rate": 1.169686957208892e-05, |
| "loss": 0.4856, |
| "step": 26760 |
| }, |
| { |
| "epoch": 0.8507258807458941, |
| "grad_norm": 1.1413007974624634, |
| "learning_rate": 1.1600482697854198e-05, |
| "loss": 0.4916, |
| "step": 26780 |
| }, |
| { |
| "epoch": 0.8513612249436132, |
| "grad_norm": 0.9246459603309631, |
| "learning_rate": 1.1504470142794121e-05, |
| "loss": 0.4807, |
| "step": 26800 |
| }, |
| { |
| "epoch": 0.8519965691413324, |
| "grad_norm": 0.9050401449203491, |
| "learning_rate": 1.140883231346449e-05, |
| "loss": 0.4844, |
| "step": 26820 |
| }, |
| { |
| "epoch": 0.8526319133390514, |
| "grad_norm": 0.8217797875404358, |
| "learning_rate": 1.1313569614834408e-05, |
| "loss": 0.4751, |
| "step": 26840 |
| }, |
| { |
| "epoch": 0.8532672575367706, |
| "grad_norm": 1.0189076662063599, |
| "learning_rate": 1.1218682450284545e-05, |
| "loss": 0.4949, |
| "step": 26860 |
| }, |
| { |
| "epoch": 0.8539026017344896, |
| "grad_norm": 0.7574889659881592, |
| "learning_rate": 1.112417122160535e-05, |
| "loss": 0.4738, |
| "step": 26880 |
| }, |
| { |
| "epoch": 0.8545379459322088, |
| "grad_norm": 0.6649676561355591, |
| "learning_rate": 1.1030036328995497e-05, |
| "loss": 0.4859, |
| "step": 26900 |
| }, |
| { |
| "epoch": 0.8551732901299279, |
| "grad_norm": 0.7144981622695923, |
| "learning_rate": 1.0936278171060032e-05, |
| "loss": 0.4799, |
| "step": 26920 |
| }, |
| { |
| "epoch": 0.855808634327647, |
| "grad_norm": 0.9074038863182068, |
| "learning_rate": 1.0842897144808762e-05, |
| "loss": 0.4951, |
| "step": 26940 |
| }, |
| { |
| "epoch": 0.8564439785253661, |
| "grad_norm": 0.9271389842033386, |
| "learning_rate": 1.0749893645654551e-05, |
| "loss": 0.4692, |
| "step": 26960 |
| }, |
| { |
| "epoch": 0.8570793227230852, |
| "grad_norm": 0.9277658462524414, |
| "learning_rate": 1.0657268067411752e-05, |
| "loss": 0.4711, |
| "step": 26980 |
| }, |
| { |
| "epoch": 0.8577146669208043, |
| "grad_norm": 1.5766148567199707, |
| "learning_rate": 1.0565020802294357e-05, |
| "loss": 0.5081, |
| "step": 27000 |
| }, |
| { |
| "epoch": 0.8577146669208043, |
| "eval_loss": 0.4444785416126251, |
| "eval_runtime": 45.2678, |
| "eval_samples_per_second": 59.711, |
| "eval_steps_per_second": 29.867, |
| "step": 27000 |
| }, |
| { |
| "epoch": 0.8583500111185235, |
| "grad_norm": 0.7567349076271057, |
| "learning_rate": 1.0473152240914419e-05, |
| "loss": 0.4671, |
| "step": 27020 |
| }, |
| { |
| "epoch": 0.8589853553162425, |
| "grad_norm": 1.0230178833007812, |
| "learning_rate": 1.0381662772280498e-05, |
| "loss": 0.4874, |
| "step": 27040 |
| }, |
| { |
| "epoch": 0.8596206995139617, |
| "grad_norm": 0.7454288005828857, |
| "learning_rate": 1.0290552783795849e-05, |
| "loss": 0.4825, |
| "step": 27060 |
| }, |
| { |
| "epoch": 0.8602560437116809, |
| "grad_norm": 0.9813241958618164, |
| "learning_rate": 1.0199822661256852e-05, |
| "loss": 0.4785, |
| "step": 27080 |
| }, |
| { |
| "epoch": 0.8608913879093999, |
| "grad_norm": 0.8269158005714417, |
| "learning_rate": 1.0109472788851427e-05, |
| "loss": 0.4797, |
| "step": 27100 |
| }, |
| { |
| "epoch": 0.861526732107119, |
| "grad_norm": 0.8101191520690918, |
| "learning_rate": 1.001950354915734e-05, |
| "loss": 0.4735, |
| "step": 27120 |
| }, |
| { |
| "epoch": 0.8621620763048381, |
| "grad_norm": 0.903421938419342, |
| "learning_rate": 9.929915323140571e-06, |
| "loss": 0.5, |
| "step": 27140 |
| }, |
| { |
| "epoch": 0.8627974205025573, |
| "grad_norm": 0.7358487248420715, |
| "learning_rate": 9.840708490153817e-06, |
| "loss": 0.4799, |
| "step": 27160 |
| }, |
| { |
| "epoch": 0.8634327647002764, |
| "grad_norm": 0.9838561415672302, |
| "learning_rate": 9.751883427934717e-06, |
| "loss": 0.506, |
| "step": 27180 |
| }, |
| { |
| "epoch": 0.8640681088979955, |
| "grad_norm": 0.9448813796043396, |
| "learning_rate": 9.66344051260436e-06, |
| "loss": 0.4966, |
| "step": 27200 |
| }, |
| { |
| "epoch": 0.8647034530957146, |
| "grad_norm": 1.111055612564087, |
| "learning_rate": 9.575380118665733e-06, |
| "loss": 0.5118, |
| "step": 27220 |
| }, |
| { |
| "epoch": 0.8653387972934338, |
| "grad_norm": 0.968305230140686, |
| "learning_rate": 9.487702619001992e-06, |
| "loss": 0.5002, |
| "step": 27240 |
| }, |
| { |
| "epoch": 0.8659741414911528, |
| "grad_norm": 0.8771995902061462, |
| "learning_rate": 9.400408384874992e-06, |
| "loss": 0.497, |
| "step": 27260 |
| }, |
| { |
| "epoch": 0.866609485688872, |
| "grad_norm": 1.0422018766403198, |
| "learning_rate": 9.31349778592373e-06, |
| "loss": 0.5081, |
| "step": 27280 |
| }, |
| { |
| "epoch": 0.867244829886591, |
| "grad_norm": 0.8950514197349548, |
| "learning_rate": 9.22697119016267e-06, |
| "loss": 0.4957, |
| "step": 27300 |
| }, |
| { |
| "epoch": 0.8678801740843102, |
| "grad_norm": 0.8093190789222717, |
| "learning_rate": 9.140828963980297e-06, |
| "loss": 0.4667, |
| "step": 27320 |
| }, |
| { |
| "epoch": 0.8685155182820293, |
| "grad_norm": 0.8465502262115479, |
| "learning_rate": 9.055071472137466e-06, |
| "loss": 0.4913, |
| "step": 27340 |
| }, |
| { |
| "epoch": 0.8691508624797484, |
| "grad_norm": 0.8349893093109131, |
| "learning_rate": 8.969699077766014e-06, |
| "loss": 0.4738, |
| "step": 27360 |
| }, |
| { |
| "epoch": 0.8697862066774675, |
| "grad_norm": 0.831910252571106, |
| "learning_rate": 8.884712142367024e-06, |
| "loss": 0.4923, |
| "step": 27380 |
| }, |
| { |
| "epoch": 0.8704215508751866, |
| "grad_norm": 0.9581566452980042, |
| "learning_rate": 8.80011102580941e-06, |
| "loss": 0.4856, |
| "step": 27400 |
| }, |
| { |
| "epoch": 0.8710568950729057, |
| "grad_norm": 0.823250412940979, |
| "learning_rate": 8.720097656085246e-06, |
| "loss": 0.4886, |
| "step": 27420 |
| }, |
| { |
| "epoch": 0.8716922392706249, |
| "grad_norm": 0.988389253616333, |
| "learning_rate": 8.636249915153039e-06, |
| "loss": 0.4946, |
| "step": 27440 |
| }, |
| { |
| "epoch": 0.8723275834683439, |
| "grad_norm": 0.85055011510849, |
| "learning_rate": 8.55695289500451e-06, |
| "loss": 0.4885, |
| "step": 27460 |
| }, |
| { |
| "epoch": 0.8729629276660631, |
| "grad_norm": 0.9092792272567749, |
| "learning_rate": 8.473859879755397e-06, |
| "loss": 0.4631, |
| "step": 27480 |
| }, |
| { |
| "epoch": 0.8735982718637822, |
| "grad_norm": 0.930949330329895, |
| "learning_rate": 8.39115442306171e-06, |
| "loss": 0.4955, |
| "step": 27500 |
| }, |
| { |
| "epoch": 0.8742336160615013, |
| "grad_norm": 0.7822802066802979, |
| "learning_rate": 8.308836875131665e-06, |
| "loss": 0.4842, |
| "step": 27520 |
| }, |
| { |
| "epoch": 0.8748689602592205, |
| "grad_norm": 0.7877179384231567, |
| "learning_rate": 8.22690758453094e-06, |
| "loss": 0.5006, |
| "step": 27540 |
| }, |
| { |
| "epoch": 0.8755043044569395, |
| "grad_norm": 0.9965065717697144, |
| "learning_rate": 8.145366898181139e-06, |
| "loss": 0.4866, |
| "step": 27560 |
| }, |
| { |
| "epoch": 0.8761396486546587, |
| "grad_norm": 1.1015229225158691, |
| "learning_rate": 8.064215161358402e-06, |
| "loss": 0.5203, |
| "step": 27580 |
| }, |
| { |
| "epoch": 0.8767749928523778, |
| "grad_norm": 0.7929244637489319, |
| "learning_rate": 7.983452717691852e-06, |
| "loss": 0.477, |
| "step": 27600 |
| }, |
| { |
| "epoch": 0.8774103370500969, |
| "grad_norm": 1.0685256719589233, |
| "learning_rate": 7.903079909162258e-06, |
| "loss": 0.5385, |
| "step": 27620 |
| }, |
| { |
| "epoch": 0.878045681247816, |
| "grad_norm": 1.0020925998687744, |
| "learning_rate": 7.82309707610046e-06, |
| "loss": 0.5061, |
| "step": 27640 |
| }, |
| { |
| "epoch": 0.8786810254455352, |
| "grad_norm": 0.8348806500434875, |
| "learning_rate": 7.743504557185976e-06, |
| "loss": 0.505, |
| "step": 27660 |
| }, |
| { |
| "epoch": 0.8793163696432542, |
| "grad_norm": 0.8327703475952148, |
| "learning_rate": 7.664302689445635e-06, |
| "loss": 0.4633, |
| "step": 27680 |
| }, |
| { |
| "epoch": 0.8799517138409734, |
| "grad_norm": 0.9524950385093689, |
| "learning_rate": 7.5854918082520435e-06, |
| "loss": 0.4859, |
| "step": 27700 |
| }, |
| { |
| "epoch": 0.8805870580386924, |
| "grad_norm": 0.8677568435668945, |
| "learning_rate": 7.507072247322211e-06, |
| "loss": 0.4832, |
| "step": 27720 |
| }, |
| { |
| "epoch": 0.8812224022364116, |
| "grad_norm": 0.9326565265655518, |
| "learning_rate": 7.429044338716196e-06, |
| "loss": 0.493, |
| "step": 27740 |
| }, |
| { |
| "epoch": 0.8818577464341307, |
| "grad_norm": 0.7510032057762146, |
| "learning_rate": 7.35140841283557e-06, |
| "loss": 0.489, |
| "step": 27760 |
| }, |
| { |
| "epoch": 0.8824930906318498, |
| "grad_norm": 0.7510486841201782, |
| "learning_rate": 7.274164798422134e-06, |
| "loss": 0.4741, |
| "step": 27780 |
| }, |
| { |
| "epoch": 0.8831284348295689, |
| "grad_norm": 0.8744218945503235, |
| "learning_rate": 7.197313822556462e-06, |
| "loss": 0.4698, |
| "step": 27800 |
| }, |
| { |
| "epoch": 0.8837637790272881, |
| "grad_norm": 0.7554096579551697, |
| "learning_rate": 7.12085581065658e-06, |
| "loss": 0.4561, |
| "step": 27820 |
| }, |
| { |
| "epoch": 0.8843991232250071, |
| "grad_norm": 1.0702250003814697, |
| "learning_rate": 7.044791086476499e-06, |
| "loss": 0.5074, |
| "step": 27840 |
| }, |
| { |
| "epoch": 0.8850344674227263, |
| "grad_norm": 1.2190712690353394, |
| "learning_rate": 6.969119972104898e-06, |
| "loss": 0.4873, |
| "step": 27860 |
| }, |
| { |
| "epoch": 0.8856698116204453, |
| "grad_norm": 0.8235007524490356, |
| "learning_rate": 6.893842787963789e-06, |
| "loss": 0.4884, |
| "step": 27880 |
| }, |
| { |
| "epoch": 0.8863051558181645, |
| "grad_norm": 0.8809916973114014, |
| "learning_rate": 6.818959852807083e-06, |
| "loss": 0.4746, |
| "step": 27900 |
| }, |
| { |
| "epoch": 0.8869405000158836, |
| "grad_norm": 0.8362717628479004, |
| "learning_rate": 6.744471483719306e-06, |
| "loss": 0.5139, |
| "step": 27920 |
| }, |
| { |
| "epoch": 0.8875758442136027, |
| "grad_norm": 0.9398446083068848, |
| "learning_rate": 6.67037799611423e-06, |
| "loss": 0.5002, |
| "step": 27940 |
| }, |
| { |
| "epoch": 0.8882111884113219, |
| "grad_norm": 0.750577449798584, |
| "learning_rate": 6.596679703733544e-06, |
| "loss": 0.4965, |
| "step": 27960 |
| }, |
| { |
| "epoch": 0.8888465326090409, |
| "grad_norm": 1.0199640989303589, |
| "learning_rate": 6.523376918645474e-06, |
| "loss": 0.5101, |
| "step": 27980 |
| }, |
| { |
| "epoch": 0.88948187680676, |
| "grad_norm": 0.8302307724952698, |
| "learning_rate": 6.4504699512435985e-06, |
| "loss": 0.4608, |
| "step": 28000 |
| }, |
| { |
| "epoch": 0.88948187680676, |
| "eval_loss": 0.4442509412765503, |
| "eval_runtime": 44.8835, |
| "eval_samples_per_second": 60.223, |
| "eval_steps_per_second": 30.122, |
| "step": 28000 |
| }, |
| { |
| "epoch": 0.8901172210044792, |
| "grad_norm": 0.7648799419403076, |
| "learning_rate": 6.377959110245357e-06, |
| "loss": 0.4704, |
| "step": 28020 |
| }, |
| { |
| "epoch": 0.8907525652021983, |
| "grad_norm": 0.8950293064117432, |
| "learning_rate": 6.305844702690878e-06, |
| "loss": 0.4906, |
| "step": 28040 |
| }, |
| { |
| "epoch": 0.8913879093999174, |
| "grad_norm": 0.9124616384506226, |
| "learning_rate": 6.234127033941628e-06, |
| "loss": 0.4939, |
| "step": 28060 |
| }, |
| { |
| "epoch": 0.8920232535976366, |
| "grad_norm": 0.8970253467559814, |
| "learning_rate": 6.1628064076791e-06, |
| "loss": 0.5088, |
| "step": 28080 |
| }, |
| { |
| "epoch": 0.8926585977953556, |
| "grad_norm": 0.9791019558906555, |
| "learning_rate": 6.091883125903575e-06, |
| "loss": 0.4613, |
| "step": 28100 |
| }, |
| { |
| "epoch": 0.8932939419930748, |
| "grad_norm": 1.3384908437728882, |
| "learning_rate": 6.021357488932789e-06, |
| "loss": 0.4737, |
| "step": 28120 |
| }, |
| { |
| "epoch": 0.8939292861907938, |
| "grad_norm": 1.076692819595337, |
| "learning_rate": 5.951229795400726e-06, |
| "loss": 0.5094, |
| "step": 28140 |
| }, |
| { |
| "epoch": 0.894564630388513, |
| "grad_norm": 0.9772495031356812, |
| "learning_rate": 5.881500342256285e-06, |
| "loss": 0.4791, |
| "step": 28160 |
| }, |
| { |
| "epoch": 0.8951999745862321, |
| "grad_norm": 0.946626603603363, |
| "learning_rate": 5.8121694247620485e-06, |
| "loss": 0.4843, |
| "step": 28180 |
| }, |
| { |
| "epoch": 0.8958353187839512, |
| "grad_norm": 0.9328265190124512, |
| "learning_rate": 5.74323733649309e-06, |
| "loss": 0.4822, |
| "step": 28200 |
| }, |
| { |
| "epoch": 0.8964706629816703, |
| "grad_norm": 0.7450932264328003, |
| "learning_rate": 5.674704369335637e-06, |
| "loss": 0.4746, |
| "step": 28220 |
| }, |
| { |
| "epoch": 0.8971060071793895, |
| "grad_norm": 1.0023432970046997, |
| "learning_rate": 5.606570813485856e-06, |
| "loss": 0.4941, |
| "step": 28240 |
| }, |
| { |
| "epoch": 0.8977413513771085, |
| "grad_norm": 0.8717949986457825, |
| "learning_rate": 5.538836957448712e-06, |
| "loss": 0.4801, |
| "step": 28260 |
| }, |
| { |
| "epoch": 0.8983766955748277, |
| "grad_norm": 0.8665459156036377, |
| "learning_rate": 5.474860277416504e-06, |
| "loss": 0.4782, |
| "step": 28280 |
| }, |
| { |
| "epoch": 0.8990120397725467, |
| "grad_norm": 0.8660995364189148, |
| "learning_rate": 5.407906659415618e-06, |
| "loss": 0.4788, |
| "step": 28300 |
| }, |
| { |
| "epoch": 0.8996473839702659, |
| "grad_norm": 0.9390355944633484, |
| "learning_rate": 5.341353582451425e-06, |
| "loss": 0.478, |
| "step": 28320 |
| }, |
| { |
| "epoch": 0.900282728167985, |
| "grad_norm": 0.8287180662155151, |
| "learning_rate": 5.275201328336477e-06, |
| "loss": 0.4846, |
| "step": 28340 |
| }, |
| { |
| "epoch": 0.9009180723657041, |
| "grad_norm": 0.8496334552764893, |
| "learning_rate": 5.209450177186081e-06, |
| "loss": 0.4838, |
| "step": 28360 |
| }, |
| { |
| "epoch": 0.9015534165634232, |
| "grad_norm": 0.9892422556877136, |
| "learning_rate": 5.144100407417063e-06, |
| "loss": 0.4854, |
| "step": 28380 |
| }, |
| { |
| "epoch": 0.9021887607611423, |
| "grad_norm": 0.9813452363014221, |
| "learning_rate": 5.0791522957467365e-06, |
| "loss": 0.4916, |
| "step": 28400 |
| }, |
| { |
| "epoch": 0.9028241049588615, |
| "grad_norm": 0.9126195907592773, |
| "learning_rate": 5.014606117191545e-06, |
| "loss": 0.4949, |
| "step": 28420 |
| }, |
| { |
| "epoch": 0.9034594491565806, |
| "grad_norm": 0.8669445514678955, |
| "learning_rate": 4.950462145066015e-06, |
| "loss": 0.482, |
| "step": 28440 |
| }, |
| { |
| "epoch": 0.9040947933542997, |
| "grad_norm": 0.9803065657615662, |
| "learning_rate": 4.886720650981569e-06, |
| "loss": 0.5025, |
| "step": 28460 |
| }, |
| { |
| "epoch": 0.9047301375520188, |
| "grad_norm": 0.9414586424827576, |
| "learning_rate": 4.823381904845392e-06, |
| "loss": 0.4856, |
| "step": 28480 |
| }, |
| { |
| "epoch": 0.905365481749738, |
| "grad_norm": 0.9295367002487183, |
| "learning_rate": 4.760446174859224e-06, |
| "loss": 0.4876, |
| "step": 28500 |
| }, |
| { |
| "epoch": 0.906000825947457, |
| "grad_norm": 0.8859279751777649, |
| "learning_rate": 4.697913727518332e-06, |
| "loss": 0.5152, |
| "step": 28520 |
| }, |
| { |
| "epoch": 0.9066361701451762, |
| "grad_norm": 0.7441398501396179, |
| "learning_rate": 4.63578482761029e-06, |
| "loss": 0.4787, |
| "step": 28540 |
| }, |
| { |
| "epoch": 0.9072715143428952, |
| "grad_norm": 1.459954023361206, |
| "learning_rate": 4.574059738213876e-06, |
| "loss": 0.4813, |
| "step": 28560 |
| }, |
| { |
| "epoch": 0.9079068585406144, |
| "grad_norm": 0.9451243281364441, |
| "learning_rate": 4.512738720698018e-06, |
| "loss": 0.4835, |
| "step": 28580 |
| }, |
| { |
| "epoch": 0.9085422027383335, |
| "grad_norm": 0.8990492820739746, |
| "learning_rate": 4.451822034720587e-06, |
| "loss": 0.4811, |
| "step": 28600 |
| }, |
| { |
| "epoch": 0.9091775469360526, |
| "grad_norm": 0.7530508637428284, |
| "learning_rate": 4.3913099382273835e-06, |
| "loss": 0.5, |
| "step": 28620 |
| }, |
| { |
| "epoch": 0.9098128911337717, |
| "grad_norm": 0.8113830089569092, |
| "learning_rate": 4.331202687451019e-06, |
| "loss": 0.5075, |
| "step": 28640 |
| }, |
| { |
| "epoch": 0.9104482353314909, |
| "grad_norm": 0.8615418672561646, |
| "learning_rate": 4.2715005369097895e-06, |
| "loss": 0.5152, |
| "step": 28660 |
| }, |
| { |
| "epoch": 0.9110835795292099, |
| "grad_norm": 0.8459773659706116, |
| "learning_rate": 4.212203739406673e-06, |
| "loss": 0.4804, |
| "step": 28680 |
| }, |
| { |
| "epoch": 0.9117189237269291, |
| "grad_norm": 0.8821284770965576, |
| "learning_rate": 4.153312546028199e-06, |
| "loss": 0.5311, |
| "step": 28700 |
| }, |
| { |
| "epoch": 0.9123542679246481, |
| "grad_norm": 1.0187216997146606, |
| "learning_rate": 4.0948272061434035e-06, |
| "loss": 0.4632, |
| "step": 28720 |
| }, |
| { |
| "epoch": 0.9129896121223673, |
| "grad_norm": 0.9274182915687561, |
| "learning_rate": 4.036747967402788e-06, |
| "loss": 0.4832, |
| "step": 28740 |
| }, |
| { |
| "epoch": 0.9136249563200864, |
| "grad_norm": 0.7573745846748352, |
| "learning_rate": 3.979075075737226e-06, |
| "loss": 0.4905, |
| "step": 28760 |
| }, |
| { |
| "epoch": 0.9142603005178055, |
| "grad_norm": 0.9005789160728455, |
| "learning_rate": 3.921808775357027e-06, |
| "loss": 0.5114, |
| "step": 28780 |
| }, |
| { |
| "epoch": 0.9148956447155246, |
| "grad_norm": 0.9073104858398438, |
| "learning_rate": 3.864949308750743e-06, |
| "loss": 0.5018, |
| "step": 28800 |
| }, |
| { |
| "epoch": 0.9155309889132438, |
| "grad_norm": 0.7230907678604126, |
| "learning_rate": 3.808496916684268e-06, |
| "loss": 0.4954, |
| "step": 28820 |
| }, |
| { |
| "epoch": 0.9161663331109628, |
| "grad_norm": 0.7139384746551514, |
| "learning_rate": 3.7524518381997885e-06, |
| "loss": 0.464, |
| "step": 28840 |
| }, |
| { |
| "epoch": 0.916801677308682, |
| "grad_norm": 0.8710399866104126, |
| "learning_rate": 3.696814310614749e-06, |
| "loss": 0.5048, |
| "step": 28860 |
| }, |
| { |
| "epoch": 0.917437021506401, |
| "grad_norm": 0.87566739320755, |
| "learning_rate": 3.6415845695208505e-06, |
| "loss": 0.484, |
| "step": 28880 |
| }, |
| { |
| "epoch": 0.9180723657041202, |
| "grad_norm": 0.9447526335716248, |
| "learning_rate": 3.586762848783076e-06, |
| "loss": 0.5032, |
| "step": 28900 |
| }, |
| { |
| "epoch": 0.9187077099018394, |
| "grad_norm": 0.7784162759780884, |
| "learning_rate": 3.53234938053868e-06, |
| "loss": 0.4451, |
| "step": 28920 |
| }, |
| { |
| "epoch": 0.9193430540995584, |
| "grad_norm": 0.9225743412971497, |
| "learning_rate": 3.478344395196198e-06, |
| "loss": 0.4745, |
| "step": 28940 |
| }, |
| { |
| "epoch": 0.9199783982972776, |
| "grad_norm": 0.9712013602256775, |
| "learning_rate": 3.4247481214345177e-06, |
| "loss": 0.4956, |
| "step": 28960 |
| }, |
| { |
| "epoch": 0.9206137424949966, |
| "grad_norm": 1.2805237770080566, |
| "learning_rate": 3.371560786201855e-06, |
| "loss": 0.4971, |
| "step": 28980 |
| }, |
| { |
| "epoch": 0.9212490866927158, |
| "grad_norm": 0.7866525053977966, |
| "learning_rate": 3.3187826147147994e-06, |
| "loss": 0.497, |
| "step": 29000 |
| }, |
| { |
| "epoch": 0.9212490866927158, |
| "eval_loss": 0.44399821758270264, |
| "eval_runtime": 45.0357, |
| "eval_samples_per_second": 60.019, |
| "eval_steps_per_second": 30.021, |
| "step": 29000 |
| }, |
| { |
| "epoch": 0.9218844308904349, |
| "grad_norm": 0.7901077270507812, |
| "learning_rate": 3.2664138304574153e-06, |
| "loss": 0.514, |
| "step": 29020 |
| }, |
| { |
| "epoch": 0.922519775088154, |
| "grad_norm": 1.0464386940002441, |
| "learning_rate": 3.2144546551802323e-06, |
| "loss": 0.5042, |
| "step": 29040 |
| }, |
| { |
| "epoch": 0.9231551192858731, |
| "grad_norm": 0.8520443439483643, |
| "learning_rate": 3.162905308899322e-06, |
| "loss": 0.4858, |
| "step": 29060 |
| }, |
| { |
| "epoch": 0.9237904634835923, |
| "grad_norm": 0.92030268907547, |
| "learning_rate": 3.1117660098953895e-06, |
| "loss": 0.4766, |
| "step": 29080 |
| }, |
| { |
| "epoch": 0.9244258076813113, |
| "grad_norm": 0.7019485235214233, |
| "learning_rate": 3.06103697471285e-06, |
| "loss": 0.4903, |
| "step": 29100 |
| }, |
| { |
| "epoch": 0.9250611518790305, |
| "grad_norm": 1.3560097217559814, |
| "learning_rate": 3.0107184181588643e-06, |
| "loss": 0.5125, |
| "step": 29120 |
| }, |
| { |
| "epoch": 0.9256964960767495, |
| "grad_norm": 0.9616526365280151, |
| "learning_rate": 2.960810553302462e-06, |
| "loss": 0.512, |
| "step": 29140 |
| }, |
| { |
| "epoch": 0.9263318402744687, |
| "grad_norm": 1.1742409467697144, |
| "learning_rate": 2.9113135914736856e-06, |
| "loss": 0.5007, |
| "step": 29160 |
| }, |
| { |
| "epoch": 0.9269671844721878, |
| "grad_norm": 0.8712571263313293, |
| "learning_rate": 2.8622277422625907e-06, |
| "loss": 0.4717, |
| "step": 29180 |
| }, |
| { |
| "epoch": 0.9276025286699069, |
| "grad_norm": 0.8578605055809021, |
| "learning_rate": 2.8135532135184384e-06, |
| "loss": 0.4989, |
| "step": 29200 |
| }, |
| { |
| "epoch": 0.928237872867626, |
| "grad_norm": 0.8551231026649475, |
| "learning_rate": 2.7652902113488143e-06, |
| "loss": 0.4825, |
| "step": 29220 |
| }, |
| { |
| "epoch": 0.9288732170653452, |
| "grad_norm": 0.82204669713974, |
| "learning_rate": 2.7174389401186996e-06, |
| "loss": 0.4702, |
| "step": 29240 |
| }, |
| { |
| "epoch": 0.9295085612630642, |
| "grad_norm": 0.9263904690742493, |
| "learning_rate": 2.6699996024496575e-06, |
| "loss": 0.4996, |
| "step": 29260 |
| }, |
| { |
| "epoch": 0.9301439054607834, |
| "grad_norm": 1.037817120552063, |
| "learning_rate": 2.6229723992189704e-06, |
| "loss": 0.4986, |
| "step": 29280 |
| }, |
| { |
| "epoch": 0.9307792496585024, |
| "grad_norm": 1.0528874397277832, |
| "learning_rate": 2.5763575295587593e-06, |
| "loss": 0.4794, |
| "step": 29300 |
| }, |
| { |
| "epoch": 0.9314145938562216, |
| "grad_norm": 0.8765133619308472, |
| "learning_rate": 2.5301551908551545e-06, |
| "loss": 0.4878, |
| "step": 29320 |
| }, |
| { |
| "epoch": 0.9320499380539408, |
| "grad_norm": 0.8322685956954956, |
| "learning_rate": 2.484365578747494e-06, |
| "loss": 0.4945, |
| "step": 29340 |
| }, |
| { |
| "epoch": 0.9326852822516598, |
| "grad_norm": 0.8344667553901672, |
| "learning_rate": 2.438988887127436e-06, |
| "loss": 0.4981, |
| "step": 29360 |
| }, |
| { |
| "epoch": 0.933320626449379, |
| "grad_norm": 0.8750690817832947, |
| "learning_rate": 2.3940253081381703e-06, |
| "loss": 0.4969, |
| "step": 29380 |
| }, |
| { |
| "epoch": 0.933955970647098, |
| "grad_norm": 0.808814287185669, |
| "learning_rate": 2.3494750321736093e-06, |
| "loss": 0.4623, |
| "step": 29400 |
| }, |
| { |
| "epoch": 0.9345913148448172, |
| "grad_norm": 0.9626306891441345, |
| "learning_rate": 2.3053382478775754e-06, |
| "loss": 0.5028, |
| "step": 29420 |
| }, |
| { |
| "epoch": 0.9352266590425363, |
| "grad_norm": 0.9727978706359863, |
| "learning_rate": 2.261615142143003e-06, |
| "loss": 0.5059, |
| "step": 29440 |
| }, |
| { |
| "epoch": 0.9358620032402554, |
| "grad_norm": 0.8926533460617065, |
| "learning_rate": 2.2183059001111174e-06, |
| "loss": 0.4764, |
| "step": 29460 |
| }, |
| { |
| "epoch": 0.9364973474379745, |
| "grad_norm": 1.0506230592727661, |
| "learning_rate": 2.1754107051707218e-06, |
| "loss": 0.5069, |
| "step": 29480 |
| }, |
| { |
| "epoch": 0.9371326916356937, |
| "grad_norm": 0.7190736532211304, |
| "learning_rate": 2.1329297389573565e-06, |
| "loss": 0.49, |
| "step": 29500 |
| }, |
| { |
| "epoch": 0.9377680358334127, |
| "grad_norm": 0.7786980867385864, |
| "learning_rate": 2.09086318135252e-06, |
| "loss": 0.4766, |
| "step": 29520 |
| }, |
| { |
| "epoch": 0.9384033800311319, |
| "grad_norm": 0.8696832060813904, |
| "learning_rate": 2.049211210483004e-06, |
| "loss": 0.4959, |
| "step": 29540 |
| }, |
| { |
| "epoch": 0.9390387242288509, |
| "grad_norm": 0.7167271375656128, |
| "learning_rate": 2.0079740027200144e-06, |
| "loss": 0.4927, |
| "step": 29560 |
| }, |
| { |
| "epoch": 0.9396740684265701, |
| "grad_norm": 0.868259072303772, |
| "learning_rate": 1.967151732678518e-06, |
| "loss": 0.4788, |
| "step": 29580 |
| }, |
| { |
| "epoch": 0.9403094126242892, |
| "grad_norm": 0.8658266663551331, |
| "learning_rate": 1.9267445732164325e-06, |
| "loss": 0.4919, |
| "step": 29600 |
| }, |
| { |
| "epoch": 0.9409447568220083, |
| "grad_norm": 1.010276436805725, |
| "learning_rate": 1.8867526954339688e-06, |
| "loss": 0.4811, |
| "step": 29620 |
| }, |
| { |
| "epoch": 0.9415801010197274, |
| "grad_norm": 0.9376817941665649, |
| "learning_rate": 1.8471762686728344e-06, |
| "loss": 0.4723, |
| "step": 29640 |
| }, |
| { |
| "epoch": 0.9422154452174466, |
| "grad_norm": 1.520297646522522, |
| "learning_rate": 1.8080154605155996e-06, |
| "loss": 0.5146, |
| "step": 29660 |
| }, |
| { |
| "epoch": 0.9428507894151656, |
| "grad_norm": 0.8532717227935791, |
| "learning_rate": 1.7692704367848756e-06, |
| "loss": 0.4556, |
| "step": 29680 |
| }, |
| { |
| "epoch": 0.9434861336128848, |
| "grad_norm": 1.069378137588501, |
| "learning_rate": 1.730941361542704e-06, |
| "loss": 0.4789, |
| "step": 29700 |
| }, |
| { |
| "epoch": 0.9441214778106038, |
| "grad_norm": 0.8771205544471741, |
| "learning_rate": 1.6930283970898574e-06, |
| "loss": 0.4819, |
| "step": 29720 |
| }, |
| { |
| "epoch": 0.944756822008323, |
| "grad_norm": 0.8729512095451355, |
| "learning_rate": 1.6555317039650852e-06, |
| "loss": 0.4792, |
| "step": 29740 |
| }, |
| { |
| "epoch": 0.9453921662060422, |
| "grad_norm": 0.8724381923675537, |
| "learning_rate": 1.6184514409444795e-06, |
| "loss": 0.4726, |
| "step": 29760 |
| }, |
| { |
| "epoch": 0.9460275104037612, |
| "grad_norm": 0.9022035598754883, |
| "learning_rate": 1.5817877650408541e-06, |
| "loss": 0.4891, |
| "step": 29780 |
| }, |
| { |
| "epoch": 0.9466628546014804, |
| "grad_norm": 1.003596544265747, |
| "learning_rate": 1.5455408315029562e-06, |
| "loss": 0.4974, |
| "step": 29800 |
| }, |
| { |
| "epoch": 0.9472981987991995, |
| "grad_norm": 0.8569382429122925, |
| "learning_rate": 1.5097107938149113e-06, |
| "loss": 0.4781, |
| "step": 29820 |
| }, |
| { |
| "epoch": 0.9479335429969186, |
| "grad_norm": 0.9094131588935852, |
| "learning_rate": 1.4742978036955457e-06, |
| "loss": 0.5155, |
| "step": 29840 |
| }, |
| { |
| "epoch": 0.9485688871946377, |
| "grad_norm": 1.0451712608337402, |
| "learning_rate": 1.4393020110977206e-06, |
| "loss": 0.4895, |
| "step": 29860 |
| }, |
| { |
| "epoch": 0.9492042313923568, |
| "grad_norm": 1.2386709451675415, |
| "learning_rate": 1.4047235642077217e-06, |
| "loss": 0.4702, |
| "step": 29880 |
| }, |
| { |
| "epoch": 0.9498395755900759, |
| "grad_norm": 0.966143786907196, |
| "learning_rate": 1.3705626094446256e-06, |
| "loss": 0.4962, |
| "step": 29900 |
| }, |
| { |
| "epoch": 0.9504749197877951, |
| "grad_norm": 0.9544230103492737, |
| "learning_rate": 1.33681929145969e-06, |
| "loss": 0.4788, |
| "step": 29920 |
| }, |
| { |
| "epoch": 0.9511102639855141, |
| "grad_norm": 0.8583151698112488, |
| "learning_rate": 1.3034937531357095e-06, |
| "loss": 0.477, |
| "step": 29940 |
| }, |
| { |
| "epoch": 0.9517456081832333, |
| "grad_norm": 0.8361521363258362, |
| "learning_rate": 1.270586135586427e-06, |
| "loss": 0.5162, |
| "step": 29960 |
| }, |
| { |
| "epoch": 0.9523809523809523, |
| "grad_norm": 1.0520914793014526, |
| "learning_rate": 1.2380965781559783e-06, |
| "loss": 0.4762, |
| "step": 29980 |
| }, |
| { |
| "epoch": 0.9530162965786715, |
| "grad_norm": 0.8727782964706421, |
| "learning_rate": 1.2060252184182386e-06, |
| "loss": 0.4929, |
| "step": 30000 |
| }, |
| { |
| "epoch": 0.9530162965786715, |
| "eval_loss": 0.443807452917099, |
| "eval_runtime": 44.5933, |
| "eval_samples_per_second": 60.614, |
| "eval_steps_per_second": 30.318, |
| "step": 30000 |
| }, |
| { |
| "epoch": 0.9536516407763906, |
| "grad_norm": 0.7989442944526672, |
| "learning_rate": 1.174372192176254e-06, |
| "loss": 0.4932, |
| "step": 30020 |
| }, |
| { |
| "epoch": 0.9542869849741097, |
| "grad_norm": 0.7544863224029541, |
| "learning_rate": 1.1431376334616994e-06, |
| "loss": 0.482, |
| "step": 30040 |
| }, |
| { |
| "epoch": 0.9549223291718288, |
| "grad_norm": 0.8897516131401062, |
| "learning_rate": 1.1123216745342779e-06, |
| "loss": 0.4898, |
| "step": 30060 |
| }, |
| { |
| "epoch": 0.955557673369548, |
| "grad_norm": 0.8291769027709961, |
| "learning_rate": 1.0819244458811773e-06, |
| "loss": 0.5021, |
| "step": 30080 |
| }, |
| { |
| "epoch": 0.956193017567267, |
| "grad_norm": 0.8413028717041016, |
| "learning_rate": 1.0519460762165144e-06, |
| "loss": 0.4762, |
| "step": 30100 |
| }, |
| { |
| "epoch": 0.9568283617649862, |
| "grad_norm": 0.9216207265853882, |
| "learning_rate": 1.0223866924807924e-06, |
| "loss": 0.4869, |
| "step": 30120 |
| }, |
| { |
| "epoch": 0.9574637059627052, |
| "grad_norm": 0.8935249447822571, |
| "learning_rate": 9.932464198403325e-07, |
| "loss": 0.4928, |
| "step": 30140 |
| }, |
| { |
| "epoch": 0.9580990501604244, |
| "grad_norm": 0.7496423721313477, |
| "learning_rate": 9.645253816867983e-07, |
| "loss": 0.5266, |
| "step": 30160 |
| }, |
| { |
| "epoch": 0.9587343943581436, |
| "grad_norm": 0.9738262295722961, |
| "learning_rate": 9.362236996366514e-07, |
| "loss": 0.4735, |
| "step": 30180 |
| }, |
| { |
| "epoch": 0.9593697385558626, |
| "grad_norm": 0.9249958395957947, |
| "learning_rate": 9.083414935305956e-07, |
| "loss": 0.4706, |
| "step": 30200 |
| }, |
| { |
| "epoch": 0.9600050827535818, |
| "grad_norm": 1.0667359828948975, |
| "learning_rate": 8.808788814331448e-07, |
| "loss": 0.4721, |
| "step": 30220 |
| }, |
| { |
| "epoch": 0.9606404269513009, |
| "grad_norm": 0.8088135123252869, |
| "learning_rate": 8.53835979632056e-07, |
| "loss": 0.4884, |
| "step": 30240 |
| }, |
| { |
| "epoch": 0.96127577114902, |
| "grad_norm": 0.9164936542510986, |
| "learning_rate": 8.272129026378639e-07, |
| "loss": 0.5022, |
| "step": 30260 |
| }, |
| { |
| "epoch": 0.9619111153467391, |
| "grad_norm": 0.7835588455200195, |
| "learning_rate": 8.010097631834245e-07, |
| "loss": 0.4707, |
| "step": 30280 |
| }, |
| { |
| "epoch": 0.9625464595444582, |
| "grad_norm": 1.2730233669281006, |
| "learning_rate": 7.752266722233614e-07, |
| "loss": 0.4795, |
| "step": 30300 |
| }, |
| { |
| "epoch": 0.9631818037421773, |
| "grad_norm": 0.9977156519889832, |
| "learning_rate": 7.511219051883567e-07, |
| "loss": 0.5209, |
| "step": 30320 |
| }, |
| { |
| "epoch": 0.9638171479398965, |
| "grad_norm": 0.941656231880188, |
| "learning_rate": 7.26158221189377e-07, |
| "loss": 0.4747, |
| "step": 30340 |
| }, |
| { |
| "epoch": 0.9644524921376155, |
| "grad_norm": 0.7258419990539551, |
| "learning_rate": 7.028320832731084e-07, |
| "loss": 0.4961, |
| "step": 30360 |
| }, |
| { |
| "epoch": 0.9650878363353347, |
| "grad_norm": 0.974557638168335, |
| "learning_rate": 6.786882081830093e-07, |
| "loss": 0.4559, |
| "step": 30380 |
| }, |
| { |
| "epoch": 0.9657231805330537, |
| "grad_norm": 0.973461925983429, |
| "learning_rate": 6.549648995460511e-07, |
| "loss": 0.4931, |
| "step": 30400 |
| }, |
| { |
| "epoch": 0.9663585247307729, |
| "grad_norm": 1.0066043138504028, |
| "learning_rate": 6.31662257816279e-07, |
| "loss": 0.4901, |
| "step": 30420 |
| }, |
| { |
| "epoch": 0.966993868928492, |
| "grad_norm": 0.9339585900306702, |
| "learning_rate": 6.087803816664628e-07, |
| "loss": 0.4697, |
| "step": 30440 |
| }, |
| { |
| "epoch": 0.9676292131262111, |
| "grad_norm": 0.8802968859672546, |
| "learning_rate": 5.863193679877088e-07, |
| "loss": 0.4943, |
| "step": 30460 |
| }, |
| { |
| "epoch": 0.9682645573239302, |
| "grad_norm": 0.7557999491691589, |
| "learning_rate": 5.6427931188896e-07, |
| "loss": 0.4761, |
| "step": 30480 |
| }, |
| { |
| "epoch": 0.9688999015216494, |
| "grad_norm": 0.9139352440834045, |
| "learning_rate": 5.426603066967295e-07, |
| "loss": 0.476, |
| "step": 30500 |
| }, |
| { |
| "epoch": 0.9695352457193684, |
| "grad_norm": 0.9125082492828369, |
| "learning_rate": 5.21462443954579e-07, |
| "loss": 0.4792, |
| "step": 30520 |
| }, |
| { |
| "epoch": 0.9701705899170876, |
| "grad_norm": 0.9351817965507507, |
| "learning_rate": 5.006858134228076e-07, |
| "loss": 0.4976, |
| "step": 30540 |
| }, |
| { |
| "epoch": 0.9708059341148066, |
| "grad_norm": 0.743870735168457, |
| "learning_rate": 4.803305030780302e-07, |
| "loss": 0.4695, |
| "step": 30560 |
| }, |
| { |
| "epoch": 0.9714412783125258, |
| "grad_norm": 0.9468183517456055, |
| "learning_rate": 4.603965991128445e-07, |
| "loss": 0.5027, |
| "step": 30580 |
| }, |
| { |
| "epoch": 0.972076622510245, |
| "grad_norm": 1.1194064617156982, |
| "learning_rate": 4.408841859354307e-07, |
| "loss": 0.5146, |
| "step": 30600 |
| }, |
| { |
| "epoch": 0.972711966707964, |
| "grad_norm": 0.7916650176048279, |
| "learning_rate": 4.21793346169197e-07, |
| "loss": 0.4689, |
| "step": 30620 |
| }, |
| { |
| "epoch": 0.9733473109056832, |
| "grad_norm": 0.9158383011817932, |
| "learning_rate": 4.0312416065245717e-07, |
| "loss": 0.5272, |
| "step": 30640 |
| }, |
| { |
| "epoch": 0.9739826551034023, |
| "grad_norm": 0.8861019015312195, |
| "learning_rate": 3.8487670843807555e-07, |
| "loss": 0.4981, |
| "step": 30660 |
| }, |
| { |
| "epoch": 0.9746179993011214, |
| "grad_norm": 1.01827871799469, |
| "learning_rate": 3.670510667931004e-07, |
| "loss": 0.5386, |
| "step": 30680 |
| }, |
| { |
| "epoch": 0.9752533434988405, |
| "grad_norm": 0.9622276425361633, |
| "learning_rate": 3.496473111984866e-07, |
| "loss": 0.5135, |
| "step": 30700 |
| }, |
| { |
| "epoch": 0.9758886876965596, |
| "grad_norm": 1.0768787860870361, |
| "learning_rate": 3.326655153487512e-07, |
| "loss": 0.4943, |
| "step": 30720 |
| }, |
| { |
| "epoch": 0.9765240318942787, |
| "grad_norm": 1.2705291509628296, |
| "learning_rate": 3.16105751151663e-07, |
| "loss": 0.4924, |
| "step": 30740 |
| }, |
| { |
| "epoch": 0.9771593760919979, |
| "grad_norm": 0.9354774951934814, |
| "learning_rate": 2.99968088727931e-07, |
| "loss": 0.4811, |
| "step": 30760 |
| }, |
| { |
| "epoch": 0.9777947202897169, |
| "grad_norm": 0.8442774415016174, |
| "learning_rate": 2.842525964109166e-07, |
| "loss": 0.4652, |
| "step": 30780 |
| }, |
| { |
| "epoch": 0.9784300644874361, |
| "grad_norm": 0.9658933281898499, |
| "learning_rate": 2.6895934074635533e-07, |
| "loss": 0.4767, |
| "step": 30800 |
| }, |
| { |
| "epoch": 0.9790654086851552, |
| "grad_norm": 0.9930063486099243, |
| "learning_rate": 2.5408838649204625e-07, |
| "loss": 0.4791, |
| "step": 30820 |
| }, |
| { |
| "epoch": 0.9797007528828743, |
| "grad_norm": 0.9439179301261902, |
| "learning_rate": 2.396397966176078e-07, |
| "loss": 0.4833, |
| "step": 30840 |
| }, |
| { |
| "epoch": 0.9803360970805934, |
| "grad_norm": 0.8499469757080078, |
| "learning_rate": 2.25613632304178e-07, |
| "loss": 0.4969, |
| "step": 30860 |
| }, |
| { |
| "epoch": 0.9809714412783125, |
| "grad_norm": 1.0228259563446045, |
| "learning_rate": 2.1200995294420323e-07, |
| "loss": 0.4709, |
| "step": 30880 |
| }, |
| { |
| "epoch": 0.9816067854760316, |
| "grad_norm": 1.1045747995376587, |
| "learning_rate": 1.988288161411389e-07, |
| "loss": 0.4964, |
| "step": 30900 |
| }, |
| { |
| "epoch": 0.9822421296737508, |
| "grad_norm": 0.8404049277305603, |
| "learning_rate": 1.8607027770921602e-07, |
| "loss": 0.5289, |
| "step": 30920 |
| }, |
| { |
| "epoch": 0.9828774738714698, |
| "grad_norm": 0.8583685755729675, |
| "learning_rate": 1.7373439167325257e-07, |
| "loss": 0.4824, |
| "step": 30940 |
| }, |
| { |
| "epoch": 0.983512818069189, |
| "grad_norm": 0.8340322375297546, |
| "learning_rate": 1.6240682931759622e-07, |
| "loss": 0.5276, |
| "step": 30960 |
| }, |
| { |
| "epoch": 0.984148162266908, |
| "grad_norm": 0.717254638671875, |
| "learning_rate": 1.508952640646988e-07, |
| "loss": 0.4837, |
| "step": 30980 |
| }, |
| { |
| "epoch": 0.9847835064646272, |
| "grad_norm": 0.7109520435333252, |
| "learning_rate": 1.3980650015292806e-07, |
| "loss": 0.4805, |
| "step": 31000 |
| }, |
| { |
| "epoch": 0.9847835064646272, |
| "eval_loss": 0.4438159465789795, |
| "eval_runtime": 44.826, |
| "eval_samples_per_second": 60.3, |
| "eval_steps_per_second": 30.161, |
| "step": 31000 |
| }, |
| { |
| "epoch": 0.9854188506623464, |
| "grad_norm": 0.8632842302322388, |
| "learning_rate": 1.2914058453658008e-07, |
| "loss": 0.4787, |
| "step": 31020 |
| }, |
| { |
| "epoch": 0.9860541948600654, |
| "grad_norm": 0.9302808046340942, |
| "learning_rate": 1.1889756237943861e-07, |
| "loss": 0.4733, |
| "step": 31040 |
| }, |
| { |
| "epoch": 0.9866895390577846, |
| "grad_norm": 1.0309478044509888, |
| "learning_rate": 1.090774770545755e-07, |
| "loss": 0.498, |
| "step": 31060 |
| }, |
| { |
| "epoch": 0.9873248832555037, |
| "grad_norm": 0.7432119250297546, |
| "learning_rate": 9.968037014420616e-08, |
| "loss": 0.4909, |
| "step": 31080 |
| }, |
| { |
| "epoch": 0.9879602274532228, |
| "grad_norm": 1.0406357049942017, |
| "learning_rate": 9.070628143946768e-08, |
| "loss": 0.4913, |
| "step": 31100 |
| }, |
| { |
| "epoch": 0.9885955716509419, |
| "grad_norm": 0.8807629346847534, |
| "learning_rate": 8.215524894024107e-08, |
| "loss": 0.4843, |
| "step": 31120 |
| }, |
| { |
| "epoch": 0.989230915848661, |
| "grad_norm": 0.815077006816864, |
| "learning_rate": 7.402730885507359e-08, |
| "loss": 0.4877, |
| "step": 31140 |
| }, |
| { |
| "epoch": 0.9898662600463801, |
| "grad_norm": 0.8051480054855347, |
| "learning_rate": 6.632249560092341e-08, |
| "loss": 0.489, |
| "step": 31160 |
| }, |
| { |
| "epoch": 0.9905016042440993, |
| "grad_norm": 0.8251180648803711, |
| "learning_rate": 5.9040841803081895e-08, |
| "loss": 0.4763, |
| "step": 31180 |
| }, |
| { |
| "epoch": 0.9911369484418183, |
| "grad_norm": 0.8782890439033508, |
| "learning_rate": 5.218237829499595e-08, |
| "loss": 0.5012, |
| "step": 31200 |
| }, |
| { |
| "epoch": 0.9917722926395375, |
| "grad_norm": 0.9451269507408142, |
| "learning_rate": 4.574713411816811e-08, |
| "loss": 0.4765, |
| "step": 31220 |
| }, |
| { |
| "epoch": 0.9924076368372566, |
| "grad_norm": 1.2340540885925293, |
| "learning_rate": 3.973513652202332e-08, |
| "loss": 0.4999, |
| "step": 31240 |
| }, |
| { |
| "epoch": 0.9930429810349757, |
| "grad_norm": 1.0101948976516724, |
| "learning_rate": 3.414641096376459e-08, |
| "loss": 0.5118, |
| "step": 31260 |
| }, |
| { |
| "epoch": 0.9936783252326948, |
| "grad_norm": 0.7806993722915649, |
| "learning_rate": 2.8980981108317485e-08, |
| "loss": 0.5068, |
| "step": 31280 |
| }, |
| { |
| "epoch": 0.9943136694304139, |
| "grad_norm": 1.1223636865615845, |
| "learning_rate": 2.4238868828196927e-08, |
| "loss": 0.5182, |
| "step": 31300 |
| }, |
| { |
| "epoch": 0.994949013628133, |
| "grad_norm": 0.8514977693557739, |
| "learning_rate": 1.9920094203418336e-08, |
| "loss": 0.5072, |
| "step": 31320 |
| }, |
| { |
| "epoch": 0.9955843578258522, |
| "grad_norm": 1.1318073272705078, |
| "learning_rate": 1.6024675521397747e-08, |
| "loss": 0.4819, |
| "step": 31340 |
| }, |
| { |
| "epoch": 0.9962197020235712, |
| "grad_norm": 0.9314286708831787, |
| "learning_rate": 1.2552629276929573e-08, |
| "loss": 0.4957, |
| "step": 31360 |
| }, |
| { |
| "epoch": 0.9968550462212904, |
| "grad_norm": 0.7769533395767212, |
| "learning_rate": 9.503970172031196e-09, |
| "loss": 0.5149, |
| "step": 31380 |
| }, |
| { |
| "epoch": 0.9974903904190096, |
| "grad_norm": 0.7601432800292969, |
| "learning_rate": 6.878711115976266e-09, |
| "loss": 0.4933, |
| "step": 31400 |
| }, |
| { |
| "epoch": 0.9981257346167286, |
| "grad_norm": 0.987147331237793, |
| "learning_rate": 4.6768632251614765e-09, |
| "loss": 0.4693, |
| "step": 31420 |
| }, |
| { |
| "epoch": 0.9987610788144478, |
| "grad_norm": 0.8807405829429626, |
| "learning_rate": 2.8984358230954577e-09, |
| "loss": 0.474, |
| "step": 31440 |
| }, |
| { |
| "epoch": 0.9993964230121668, |
| "grad_norm": 0.7518433332443237, |
| "learning_rate": 1.5434364403543733e-09, |
| "loss": 0.5076, |
| "step": 31460 |
| }, |
| { |
| "epoch": 1.0, |
| "step": 31479, |
| "total_flos": 0.0, |
| "train_loss": 0.3508217529017671, |
| "train_runtime": 14676.7422, |
| "train_samples_per_second": 68.633, |
| "train_steps_per_second": 2.145 |
| } |
| ], |
| "logging_steps": 20, |
| "max_steps": 31479, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 2000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|