| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.5767012687427913, |
| "eval_steps": 500, |
| "global_step": 2000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0014417531718569781, |
| "grad_norm": 3808.0, |
| "kd_loss": 0.4765625, |
| "learning_rate": 1.3333333333333334e-06, |
| "loss": 2.4865, |
| "step": 5, |
| "student_loss": 1.2782293558120728, |
| "teacher_loss": 0.0020202866289764643 |
| }, |
| { |
| "epoch": 0.0028835063437139563, |
| "grad_norm": 496.0, |
| "kd_loss": 0.453125, |
| "learning_rate": 3e-06, |
| "loss": 2.0957, |
| "step": 10, |
| "student_loss": 1.0292338132858276, |
| "teacher_loss": 0.005245466250926256 |
| }, |
| { |
| "epoch": 0.004325259515570935, |
| "grad_norm": 238.0, |
| "kd_loss": 0.4453125, |
| "learning_rate": 4.666666666666667e-06, |
| "loss": 1.9295, |
| "step": 15, |
| "student_loss": 0.631219208240509, |
| "teacher_loss": 0.0013347615022212267 |
| }, |
| { |
| "epoch": 0.0057670126874279125, |
| "grad_norm": 133.0, |
| "kd_loss": 0.44140625, |
| "learning_rate": 6.333333333333333e-06, |
| "loss": 1.8503, |
| "step": 20, |
| "student_loss": 1.4391331672668457, |
| "teacher_loss": 0.0005473981145769358 |
| }, |
| { |
| "epoch": 0.00720876585928489, |
| "grad_norm": 75.5, |
| "kd_loss": 0.3984375, |
| "learning_rate": 8.000000000000001e-06, |
| "loss": 1.0972, |
| "step": 25, |
| "student_loss": 0.18605084717273712, |
| "teacher_loss": 0.0011649713851511478 |
| }, |
| { |
| "epoch": 0.00865051903114187, |
| "grad_norm": 24.25, |
| "kd_loss": 0.376953125, |
| "learning_rate": 9.666666666666667e-06, |
| "loss": 0.5883, |
| "step": 30, |
| "student_loss": 0.05895603448152542, |
| "teacher_loss": 0.030797353014349937 |
| }, |
| { |
| "epoch": 0.010092272202998846, |
| "grad_norm": 9.1875, |
| "kd_loss": 0.318359375, |
| "learning_rate": 9.99958042442916e-06, |
| "loss": 0.4657, |
| "step": 35, |
| "student_loss": 0.00801500491797924, |
| "teacher_loss": 0.04946871101856232 |
| }, |
| { |
| "epoch": 0.011534025374855825, |
| "grad_norm": 7.09375, |
| "kd_loss": 0.296875, |
| "learning_rate": 9.997876019358083e-06, |
| "loss": 0.402, |
| "step": 40, |
| "student_loss": 0.27407729625701904, |
| "teacher_loss": 0.04900167137384415 |
| }, |
| { |
| "epoch": 0.012975778546712802, |
| "grad_norm": 10.6875, |
| "kd_loss": 0.265625, |
| "learning_rate": 9.99486100792044e-06, |
| "loss": 0.3281, |
| "step": 45, |
| "student_loss": 0.3285456597805023, |
| "teacher_loss": 0.003808467648923397 |
| }, |
| { |
| "epoch": 0.01441753171856978, |
| "grad_norm": 21.875, |
| "kd_loss": 0.2578125, |
| "learning_rate": 9.990536180750724e-06, |
| "loss": 0.351, |
| "step": 50, |
| "student_loss": 0.03528054431080818, |
| "teacher_loss": 0.04187293350696564 |
| }, |
| { |
| "epoch": 0.015859284890426758, |
| "grad_norm": 228.0, |
| "kd_loss": 0.2421875, |
| "learning_rate": 9.984902671959911e-06, |
| "loss": 0.3368, |
| "step": 55, |
| "student_loss": 0.004587164148688316, |
| "teacher_loss": 0.0026238495483994484 |
| }, |
| { |
| "epoch": 0.01730103806228374, |
| "grad_norm": 6.4375, |
| "kd_loss": 0.23046875, |
| "learning_rate": 9.97796195883804e-06, |
| "loss": 0.3291, |
| "step": 60, |
| "student_loss": 0.048888176679611206, |
| "teacher_loss": 0.0033420324325561523 |
| }, |
| { |
| "epoch": 0.018742791234140715, |
| "grad_norm": 6.375, |
| "kd_loss": 0.2099609375, |
| "learning_rate": 9.969715861466839e-06, |
| "loss": 0.3147, |
| "step": 65, |
| "student_loss": 0.11189773678779602, |
| "teacher_loss": 0.0413309670984745 |
| }, |
| { |
| "epoch": 0.020184544405997693, |
| "grad_norm": 4.09375, |
| "kd_loss": 0.208984375, |
| "learning_rate": 9.96016654224243e-06, |
| "loss": 0.3096, |
| "step": 70, |
| "student_loss": 0.026729928329586983, |
| "teacher_loss": 0.0019827873911708593 |
| }, |
| { |
| "epoch": 0.02162629757785467, |
| "grad_norm": 4.25, |
| "kd_loss": 0.1748046875, |
| "learning_rate": 9.94931650530827e-06, |
| "loss": 0.2729, |
| "step": 75, |
| "student_loss": 0.006473238579928875, |
| "teacher_loss": 0.005465318448841572 |
| }, |
| { |
| "epoch": 0.02306805074971165, |
| "grad_norm": 3.859375, |
| "kd_loss": 0.16796875, |
| "learning_rate": 9.93716859589851e-06, |
| "loss": 0.2662, |
| "step": 80, |
| "student_loss": 0.00662571657449007, |
| "teacher_loss": 0.004144964274019003 |
| }, |
| { |
| "epoch": 0.024509803921568627, |
| "grad_norm": 2.78125, |
| "kd_loss": 0.1640625, |
| "learning_rate": 9.923725999591846e-06, |
| "loss": 0.2261, |
| "step": 85, |
| "student_loss": 0.004623747896403074, |
| "teacher_loss": 0.002723712706938386 |
| }, |
| { |
| "epoch": 0.025951557093425604, |
| "grad_norm": 5.5625, |
| "kd_loss": 0.1943359375, |
| "learning_rate": 9.908992241476189e-06, |
| "loss": 0.2543, |
| "step": 90, |
| "student_loss": 0.11273087561130524, |
| "teacher_loss": 0.0015502618625760078 |
| }, |
| { |
| "epoch": 0.027393310265282585, |
| "grad_norm": 3.4375, |
| "kd_loss": 0.189453125, |
| "learning_rate": 9.892971185224244e-06, |
| "loss": 0.2267, |
| "step": 95, |
| "student_loss": 0.006197327747941017, |
| "teacher_loss": 0.008993545547127724 |
| }, |
| { |
| "epoch": 0.02883506343713956, |
| "grad_norm": 6.34375, |
| "kd_loss": 0.134765625, |
| "learning_rate": 9.875667032080354e-06, |
| "loss": 0.2274, |
| "step": 100, |
| "student_loss": 0.0032730416860431433, |
| "teacher_loss": 0.0036007578019052744 |
| }, |
| { |
| "epoch": 0.03027681660899654, |
| "grad_norm": 4.0, |
| "kd_loss": 0.146484375, |
| "learning_rate": 9.857084319758772e-06, |
| "loss": 0.2421, |
| "step": 105, |
| "student_loss": 0.04058241471648216, |
| "teacher_loss": 0.0012296679196879268 |
| }, |
| { |
| "epoch": 0.031718569780853516, |
| "grad_norm": 4.6875, |
| "kd_loss": 0.1787109375, |
| "learning_rate": 9.837227921253747e-06, |
| "loss": 0.2273, |
| "step": 110, |
| "student_loss": 0.004547884222120047, |
| "teacher_loss": 0.023880567401647568 |
| }, |
| { |
| "epoch": 0.03316032295271049, |
| "grad_norm": 2.625, |
| "kd_loss": 0.14453125, |
| "learning_rate": 9.816103043561648e-06, |
| "loss": 0.2142, |
| "step": 115, |
| "student_loss": 0.001855566632002592, |
| "teacher_loss": 0.0016716865357011557 |
| }, |
| { |
| "epoch": 0.03460207612456748, |
| "grad_norm": 3.078125, |
| "kd_loss": 0.1357421875, |
| "learning_rate": 9.79371522631553e-06, |
| "loss": 0.2149, |
| "step": 120, |
| "student_loss": 0.019737211987376213, |
| "teacher_loss": 0.0027425403241068125 |
| }, |
| { |
| "epoch": 0.036043829296424454, |
| "grad_norm": 3.390625, |
| "kd_loss": 0.146484375, |
| "learning_rate": 9.770070340332457e-06, |
| "loss": 0.1956, |
| "step": 125, |
| "student_loss": 0.10938042402267456, |
| "teacher_loss": 0.001064821844920516 |
| }, |
| { |
| "epoch": 0.03748558246828143, |
| "grad_norm": 3.734375, |
| "kd_loss": 0.146484375, |
| "learning_rate": 9.745174586073982e-06, |
| "loss": 0.2099, |
| "step": 130, |
| "student_loss": 0.0035836249589920044, |
| "teacher_loss": 0.002439548959955573 |
| }, |
| { |
| "epoch": 0.03892733564013841, |
| "grad_norm": 3.0, |
| "kd_loss": 0.1337890625, |
| "learning_rate": 9.719034492020183e-06, |
| "loss": 0.202, |
| "step": 135, |
| "student_loss": 0.003862213110551238, |
| "teacher_loss": 0.0010834896238520741 |
| }, |
| { |
| "epoch": 0.040369088811995385, |
| "grad_norm": 7.1875, |
| "kd_loss": 0.1357421875, |
| "learning_rate": 9.691656912957686e-06, |
| "loss": 0.218, |
| "step": 140, |
| "student_loss": 0.0022195407655090094, |
| "teacher_loss": 0.0014228483196347952 |
| }, |
| { |
| "epoch": 0.04181084198385236, |
| "grad_norm": 3.703125, |
| "kd_loss": 0.150390625, |
| "learning_rate": 9.663049028182112e-06, |
| "loss": 0.2077, |
| "step": 145, |
| "student_loss": 0.11333022266626358, |
| "teacher_loss": 0.00682886503636837 |
| }, |
| { |
| "epoch": 0.04325259515570934, |
| "grad_norm": 2.859375, |
| "kd_loss": 0.14453125, |
| "learning_rate": 9.633218339615433e-06, |
| "loss": 0.1935, |
| "step": 150, |
| "student_loss": 0.0032606760505586863, |
| "teacher_loss": 0.0031273479107767344 |
| }, |
| { |
| "epoch": 0.04469434832756632, |
| "grad_norm": 3.578125, |
| "kd_loss": 0.12451171875, |
| "learning_rate": 9.602172669838721e-06, |
| "loss": 0.2199, |
| "step": 155, |
| "student_loss": 0.0088576041162014, |
| "teacher_loss": 0.0016166985733434558 |
| }, |
| { |
| "epoch": 0.0461361014994233, |
| "grad_norm": 4.09375, |
| "kd_loss": 0.1708984375, |
| "learning_rate": 9.569920160040815e-06, |
| "loss": 0.2018, |
| "step": 160, |
| "student_loss": 0.13294154405593872, |
| "teacher_loss": 0.03791189566254616 |
| }, |
| { |
| "epoch": 0.04757785467128028, |
| "grad_norm": 3.921875, |
| "kd_loss": 0.138671875, |
| "learning_rate": 9.536469267883432e-06, |
| "loss": 0.208, |
| "step": 165, |
| "student_loss": 0.002772042527794838, |
| "teacher_loss": 0.005522818770259619 |
| }, |
| { |
| "epoch": 0.049019607843137254, |
| "grad_norm": 5.03125, |
| "kd_loss": 0.126953125, |
| "learning_rate": 9.501828765283295e-06, |
| "loss": 0.1962, |
| "step": 170, |
| "student_loss": 0.003656906308606267, |
| "teacher_loss": 0.0018494409741833806 |
| }, |
| { |
| "epoch": 0.05046136101499423, |
| "grad_norm": 3.515625, |
| "kd_loss": 0.12451171875, |
| "learning_rate": 9.466007736111846e-06, |
| "loss": 0.1935, |
| "step": 175, |
| "student_loss": 0.017079656943678856, |
| "teacher_loss": 0.0010717228287830949 |
| }, |
| { |
| "epoch": 0.05190311418685121, |
| "grad_norm": 5.03125, |
| "kd_loss": 0.11669921875, |
| "learning_rate": 9.429015573813163e-06, |
| "loss": 0.1861, |
| "step": 180, |
| "student_loss": 0.003456867765635252, |
| "teacher_loss": 0.0010596277425065637 |
| }, |
| { |
| "epoch": 0.05334486735870819, |
| "grad_norm": 4.0625, |
| "kd_loss": 0.1455078125, |
| "learning_rate": 9.390861978940687e-06, |
| "loss": 0.1921, |
| "step": 185, |
| "student_loss": 0.31187787652015686, |
| "teacher_loss": 0.0008243238553404808 |
| }, |
| { |
| "epoch": 0.05478662053056517, |
| "grad_norm": 5.09375, |
| "kd_loss": 0.1416015625, |
| "learning_rate": 9.351556956613423e-06, |
| "loss": 0.2044, |
| "step": 190, |
| "student_loss": 0.011734717525541782, |
| "teacher_loss": 0.0015390698099508882 |
| }, |
| { |
| "epoch": 0.056228373702422146, |
| "grad_norm": 4.5, |
| "kd_loss": 0.1337890625, |
| "learning_rate": 9.31111081389227e-06, |
| "loss": 0.1778, |
| "step": 195, |
| "student_loss": 0.05741060897707939, |
| "teacher_loss": 0.0007789382943883538 |
| }, |
| { |
| "epoch": 0.05767012687427912, |
| "grad_norm": 2.28125, |
| "kd_loss": 0.12890625, |
| "learning_rate": 9.269534157077177e-06, |
| "loss": 0.1743, |
| "step": 200, |
| "student_loss": 0.0014264394994825125, |
| "teacher_loss": 0.0006533891428261995 |
| }, |
| { |
| "epoch": 0.0591118800461361, |
| "grad_norm": 4.25, |
| "kd_loss": 0.1376953125, |
| "learning_rate": 9.226837888925813e-06, |
| "loss": 0.1969, |
| "step": 205, |
| "student_loss": 0.0015782959526404738, |
| "teacher_loss": 0.0368424728512764 |
| }, |
| { |
| "epoch": 0.06055363321799308, |
| "grad_norm": 7.5625, |
| "kd_loss": 0.1123046875, |
| "learning_rate": 9.183033205794525e-06, |
| "loss": 0.1836, |
| "step": 210, |
| "student_loss": 0.01342203002423048, |
| "teacher_loss": 0.0011842605890706182 |
| }, |
| { |
| "epoch": 0.061995386389850055, |
| "grad_norm": 3.0, |
| "kd_loss": 0.126953125, |
| "learning_rate": 9.13813159470227e-06, |
| "loss": 0.1824, |
| "step": 215, |
| "student_loss": 0.0014404732501134276, |
| "teacher_loss": 0.0007589462329633534 |
| }, |
| { |
| "epoch": 0.06343713956170703, |
| "grad_norm": 7.125, |
| "kd_loss": 0.1328125, |
| "learning_rate": 9.092144830318357e-06, |
| "loss": 0.21, |
| "step": 220, |
| "student_loss": 0.2732444703578949, |
| "teacher_loss": 0.00960276648402214 |
| }, |
| { |
| "epoch": 0.06487889273356401, |
| "grad_norm": 6.15625, |
| "kd_loss": 0.1083984375, |
| "learning_rate": 9.045084971874738e-06, |
| "loss": 0.1941, |
| "step": 225, |
| "student_loss": 0.2691424489021301, |
| "teacher_loss": 0.003315337933599949 |
| }, |
| { |
| "epoch": 0.06632064590542099, |
| "grad_norm": 2.46875, |
| "kd_loss": 0.10986328125, |
| "learning_rate": 8.99696436000368e-06, |
| "loss": 0.1702, |
| "step": 230, |
| "student_loss": 0.08001423627138138, |
| "teacher_loss": 0.009347192943096161 |
| }, |
| { |
| "epoch": 0.06776239907727798, |
| "grad_norm": 3.734375, |
| "kd_loss": 0.1337890625, |
| "learning_rate": 8.947795613501658e-06, |
| "loss": 0.1778, |
| "step": 235, |
| "student_loss": 0.003426821669563651, |
| "teacher_loss": 0.0008243515621870756 |
| }, |
| { |
| "epoch": 0.06920415224913495, |
| "grad_norm": 3.421875, |
| "kd_loss": 0.10205078125, |
| "learning_rate": 8.897591626020284e-06, |
| "loss": 0.1928, |
| "step": 240, |
| "student_loss": 0.004231320694088936, |
| "teacher_loss": 0.0010777115821838379 |
| }, |
| { |
| "epoch": 0.07064590542099193, |
| "grad_norm": 3.640625, |
| "kd_loss": 0.11181640625, |
| "learning_rate": 8.846365562685178e-06, |
| "loss": 0.1721, |
| "step": 245, |
| "student_loss": 0.003189836163073778, |
| "teacher_loss": 0.0029278292786329985 |
| }, |
| { |
| "epoch": 0.07208765859284891, |
| "grad_norm": 3.28125, |
| "kd_loss": 0.107421875, |
| "learning_rate": 8.794130856643635e-06, |
| "loss": 0.1624, |
| "step": 250, |
| "student_loss": 0.0030161093454807997, |
| "teacher_loss": 0.0015030049253255129 |
| }, |
| { |
| "epoch": 0.07352941176470588, |
| "grad_norm": 2.671875, |
| "kd_loss": 0.1171875, |
| "learning_rate": 8.74090120554202e-06, |
| "loss": 0.181, |
| "step": 255, |
| "student_loss": 0.0010447532404214144, |
| "teacher_loss": 0.0014572968939319253 |
| }, |
| { |
| "epoch": 0.07497116493656286, |
| "grad_norm": 3.03125, |
| "kd_loss": 0.125, |
| "learning_rate": 8.686690567933803e-06, |
| "loss": 0.18, |
| "step": 260, |
| "student_loss": 0.002235305029898882, |
| "teacher_loss": 0.03705403953790665 |
| }, |
| { |
| "epoch": 0.07641291810841984, |
| "grad_norm": 3.84375, |
| "kd_loss": 0.10107421875, |
| "learning_rate": 8.63151315961915e-06, |
| "loss": 0.1751, |
| "step": 265, |
| "student_loss": 0.0019888102542608976, |
| "teacher_loss": 0.0012628042604774237 |
| }, |
| { |
| "epoch": 0.07785467128027682, |
| "grad_norm": 4.03125, |
| "kd_loss": 0.103515625, |
| "learning_rate": 8.575383449917103e-06, |
| "loss": 0.1698, |
| "step": 270, |
| "student_loss": 0.009670126251876354, |
| "teacher_loss": 0.0018007074249908328 |
| }, |
| { |
| "epoch": 0.07929642445213379, |
| "grad_norm": 5.0625, |
| "kd_loss": 0.11669921875, |
| "learning_rate": 8.518316157871232e-06, |
| "loss": 0.1792, |
| "step": 275, |
| "student_loss": 0.0027291348669677973, |
| "teacher_loss": 0.03865275904536247 |
| }, |
| { |
| "epoch": 0.08073817762399077, |
| "grad_norm": 6.40625, |
| "kd_loss": 0.12451171875, |
| "learning_rate": 8.460326248389825e-06, |
| "loss": 0.1868, |
| "step": 280, |
| "student_loss": 0.0005779159837402403, |
| "teacher_loss": 0.0004988706787116826 |
| }, |
| { |
| "epoch": 0.08217993079584775, |
| "grad_norm": 5.03125, |
| "kd_loss": 0.1083984375, |
| "learning_rate": 8.401428928321607e-06, |
| "loss": 0.1777, |
| "step": 285, |
| "student_loss": 0.00322159961797297, |
| "teacher_loss": 0.0016653644852340221 |
| }, |
| { |
| "epoch": 0.08362168396770472, |
| "grad_norm": 4.15625, |
| "kd_loss": 0.119140625, |
| "learning_rate": 8.341639642468002e-06, |
| "loss": 0.2245, |
| "step": 290, |
| "student_loss": 0.025423452258110046, |
| "teacher_loss": 0.006107364781200886 |
| }, |
| { |
| "epoch": 0.0850634371395617, |
| "grad_norm": 2.21875, |
| "kd_loss": 0.10498046875, |
| "learning_rate": 8.280974069532999e-06, |
| "loss": 0.1742, |
| "step": 295, |
| "student_loss": 0.0032805479131639004, |
| "teacher_loss": 0.002079744590446353 |
| }, |
| { |
| "epoch": 0.08650519031141868, |
| "grad_norm": 4.875, |
| "kd_loss": 0.1220703125, |
| "learning_rate": 8.219448118011687e-06, |
| "loss": 0.1698, |
| "step": 300, |
| "student_loss": 0.05386965721845627, |
| "teacher_loss": 0.0015291464515030384 |
| }, |
| { |
| "epoch": 0.08794694348327567, |
| "grad_norm": 2.703125, |
| "kd_loss": 0.09375, |
| "learning_rate": 8.157077922018537e-06, |
| "loss": 0.1735, |
| "step": 305, |
| "student_loss": 0.007909238338470459, |
| "teacher_loss": 0.0032228778582066298 |
| }, |
| { |
| "epoch": 0.08938869665513265, |
| "grad_norm": 6.375, |
| "kd_loss": 0.091796875, |
| "learning_rate": 8.093879837056486e-06, |
| "loss": 0.1662, |
| "step": 310, |
| "student_loss": 0.0014559343690052629, |
| "teacher_loss": 0.0014570873463526368 |
| }, |
| { |
| "epoch": 0.09083044982698962, |
| "grad_norm": 6.09375, |
| "kd_loss": 0.09716796875, |
| "learning_rate": 8.029870435728018e-06, |
| "loss": 0.1905, |
| "step": 315, |
| "student_loss": 0.13904070854187012, |
| "teacher_loss": 0.00045576939010061324 |
| }, |
| { |
| "epoch": 0.0922722029988466, |
| "grad_norm": 8.75, |
| "kd_loss": 0.091796875, |
| "learning_rate": 7.965066503389264e-06, |
| "loss": 0.1801, |
| "step": 320, |
| "student_loss": 0.0017298327293246984, |
| "teacher_loss": 0.001036101020872593 |
| }, |
| { |
| "epoch": 0.09371395617070358, |
| "grad_norm": 5.53125, |
| "kd_loss": 0.15625, |
| "learning_rate": 7.89948503374835e-06, |
| "loss": 0.1636, |
| "step": 325, |
| "student_loss": 0.0033407427836209536, |
| "teacher_loss": 0.02077825367450714 |
| }, |
| { |
| "epoch": 0.09515570934256055, |
| "grad_norm": 5.09375, |
| "kd_loss": 0.10693359375, |
| "learning_rate": 7.833143224409076e-06, |
| "loss": 0.1884, |
| "step": 330, |
| "student_loss": 0.006418874487280846, |
| "teacher_loss": 0.0011637036222964525 |
| }, |
| { |
| "epoch": 0.09659746251441753, |
| "grad_norm": 4.71875, |
| "kd_loss": 0.09716796875, |
| "learning_rate": 7.766058472361154e-06, |
| "loss": 0.1577, |
| "step": 335, |
| "student_loss": 0.0016794800758361816, |
| "teacher_loss": 0.0023754944559186697 |
| }, |
| { |
| "epoch": 0.09803921568627451, |
| "grad_norm": 5.09375, |
| "kd_loss": 0.09326171875, |
| "learning_rate": 7.698248369418146e-06, |
| "loss": 0.1589, |
| "step": 340, |
| "student_loss": 0.044694170355796814, |
| "teacher_loss": 0.007826481945812702 |
| }, |
| { |
| "epoch": 0.09948096885813149, |
| "grad_norm": 5.375, |
| "kd_loss": 0.0966796875, |
| "learning_rate": 7.629730697604314e-06, |
| "loss": 0.1807, |
| "step": 345, |
| "student_loss": 0.09194417297840118, |
| "teacher_loss": 0.0007945778197608888 |
| }, |
| { |
| "epoch": 0.10092272202998846, |
| "grad_norm": 5.875, |
| "kd_loss": 0.11865234375, |
| "learning_rate": 7.560523424491595e-06, |
| "loss": 0.1526, |
| "step": 350, |
| "student_loss": 0.005946993827819824, |
| "teacher_loss": 0.0006145219667814672 |
| }, |
| { |
| "epoch": 0.10236447520184544, |
| "grad_norm": 9.1875, |
| "kd_loss": 0.10302734375, |
| "learning_rate": 7.490644698487909e-06, |
| "loss": 0.1627, |
| "step": 355, |
| "student_loss": 0.0015843416331335902, |
| "teacher_loss": 0.0014968032483011484 |
| }, |
| { |
| "epoch": 0.10380622837370242, |
| "grad_norm": 12.625, |
| "kd_loss": 0.10791015625, |
| "learning_rate": 7.420112844078066e-06, |
| "loss": 0.1682, |
| "step": 360, |
| "student_loss": 0.01987134851515293, |
| "teacher_loss": 0.001595525536686182 |
| }, |
| { |
| "epoch": 0.1052479815455594, |
| "grad_norm": 6.5625, |
| "kd_loss": 0.103515625, |
| "learning_rate": 7.348946357018479e-06, |
| "loss": 0.1509, |
| "step": 365, |
| "student_loss": 0.006010106764733791, |
| "teacher_loss": 0.032394833862781525 |
| }, |
| { |
| "epoch": 0.10668973471741638, |
| "grad_norm": 6.6875, |
| "kd_loss": 0.1181640625, |
| "learning_rate": 7.277163899486975e-06, |
| "loss": 0.1623, |
| "step": 370, |
| "student_loss": 0.15845070779323578, |
| "teacher_loss": 0.0004756299313157797 |
| }, |
| { |
| "epoch": 0.10813148788927336, |
| "grad_norm": 4.625, |
| "kd_loss": 0.1240234375, |
| "learning_rate": 7.204784295188959e-06, |
| "loss": 0.1506, |
| "step": 375, |
| "student_loss": 0.10649572312831879, |
| "teacher_loss": 0.02242193929851055 |
| }, |
| { |
| "epoch": 0.10957324106113034, |
| "grad_norm": 3.4375, |
| "kd_loss": 0.1025390625, |
| "learning_rate": 7.1318265244212305e-06, |
| "loss": 0.1752, |
| "step": 380, |
| "student_loss": 0.00281524658203125, |
| "teacher_loss": 0.0015117195434868336 |
| }, |
| { |
| "epoch": 0.11101499423298732, |
| "grad_norm": 2.359375, |
| "kd_loss": 0.10205078125, |
| "learning_rate": 7.05830971909472e-06, |
| "loss": 0.1547, |
| "step": 385, |
| "student_loss": 0.0016335069667547941, |
| "teacher_loss": 0.0012311713071539998 |
| }, |
| { |
| "epoch": 0.11245674740484429, |
| "grad_norm": 4.125, |
| "kd_loss": 0.10205078125, |
| "learning_rate": 6.9842531577174865e-06, |
| "loss": 0.1538, |
| "step": 390, |
| "student_loss": 0.0012884392635896802, |
| "teacher_loss": 0.001418368425220251 |
| }, |
| { |
| "epoch": 0.11389850057670127, |
| "grad_norm": 5.1875, |
| "kd_loss": 0.1025390625, |
| "learning_rate": 6.9096762603392595e-06, |
| "loss": 0.1698, |
| "step": 395, |
| "student_loss": 0.0018499374855309725, |
| "teacher_loss": 0.0013690440682694316 |
| }, |
| { |
| "epoch": 0.11534025374855825, |
| "grad_norm": 6.53125, |
| "kd_loss": 0.10546875, |
| "learning_rate": 6.834598583458862e-06, |
| "loss": 0.16, |
| "step": 400, |
| "student_loss": 0.0014036521315574646, |
| "teacher_loss": 0.00040830764919519424 |
| }, |
| { |
| "epoch": 0.11678200692041522, |
| "grad_norm": 4.71875, |
| "kd_loss": 0.0888671875, |
| "learning_rate": 6.7590398148958625e-06, |
| "loss": 0.1718, |
| "step": 405, |
| "student_loss": 0.10261467099189758, |
| "teacher_loss": 0.0006754493806511164 |
| }, |
| { |
| "epoch": 0.1182237600922722, |
| "grad_norm": 4.3125, |
| "kd_loss": 0.171875, |
| "learning_rate": 6.6830197686277945e-06, |
| "loss": 0.1878, |
| "step": 410, |
| "student_loss": 0.4882833659648895, |
| "teacher_loss": 0.00981883890926838 |
| }, |
| { |
| "epoch": 0.11966551326412918, |
| "grad_norm": 2.8125, |
| "kd_loss": 0.1083984375, |
| "learning_rate": 6.6065583795942625e-06, |
| "loss": 0.182, |
| "step": 415, |
| "student_loss": 0.03729023039340973, |
| "teacher_loss": 0.0042837257497012615 |
| }, |
| { |
| "epoch": 0.12110726643598616, |
| "grad_norm": 4.65625, |
| "kd_loss": 0.1064453125, |
| "learning_rate": 6.52967569846937e-06, |
| "loss": 0.1607, |
| "step": 420, |
| "student_loss": 0.05881139263510704, |
| "teacher_loss": 0.024456653743982315 |
| }, |
| { |
| "epoch": 0.12254901960784313, |
| "grad_norm": 4.25, |
| "kd_loss": 0.10009765625, |
| "learning_rate": 6.452391886403767e-06, |
| "loss": 0.1674, |
| "step": 425, |
| "student_loss": 0.05037780851125717, |
| "teacher_loss": 0.0040146904066205025 |
| }, |
| { |
| "epoch": 0.12399077277970011, |
| "grad_norm": 4.75, |
| "kd_loss": 0.1748046875, |
| "learning_rate": 6.374727209737743e-06, |
| "loss": 0.1766, |
| "step": 430, |
| "student_loss": 0.00238221138715744, |
| "teacher_loss": 0.06439146399497986 |
| }, |
| { |
| "epoch": 0.1254325259515571, |
| "grad_norm": 5.53125, |
| "kd_loss": 0.1015625, |
| "learning_rate": 6.296702034686726e-06, |
| "loss": 0.1714, |
| "step": 435, |
| "student_loss": 0.002659996272996068, |
| "teacher_loss": 0.0022907655220478773 |
| }, |
| { |
| "epoch": 0.12687427912341406, |
| "grad_norm": 3.078125, |
| "kd_loss": 0.1845703125, |
| "learning_rate": 6.218336822000598e-06, |
| "loss": 0.1775, |
| "step": 440, |
| "student_loss": 0.46329638361930847, |
| "teacher_loss": 0.008188321255147457 |
| }, |
| { |
| "epoch": 0.12831603229527105, |
| "grad_norm": 3.953125, |
| "kd_loss": 0.1259765625, |
| "learning_rate": 6.139652121598219e-06, |
| "loss": 0.1769, |
| "step": 445, |
| "student_loss": 0.0006292742909863591, |
| "teacher_loss": 0.02016839198768139 |
| }, |
| { |
| "epoch": 0.12975778546712802, |
| "grad_norm": 3.53125, |
| "kd_loss": 0.10400390625, |
| "learning_rate": 6.060668567178561e-06, |
| "loss": 0.1663, |
| "step": 450, |
| "student_loss": 0.002717025112360716, |
| "teacher_loss": 0.0016874197172001004 |
| }, |
| { |
| "epoch": 0.131199538638985, |
| "grad_norm": 2.671875, |
| "kd_loss": 0.087890625, |
| "learning_rate": 5.981406870809889e-06, |
| "loss": 0.1748, |
| "step": 455, |
| "student_loss": 0.012300008907914162, |
| "teacher_loss": 0.0016890015685930848 |
| }, |
| { |
| "epoch": 0.13264129181084197, |
| "grad_norm": 5.1875, |
| "kd_loss": 0.10791015625, |
| "learning_rate": 5.9018878174983674e-06, |
| "loss": 0.17, |
| "step": 460, |
| "student_loss": 0.03240777552127838, |
| "teacher_loss": 0.0010722745209932327 |
| }, |
| { |
| "epoch": 0.13408304498269896, |
| "grad_norm": 2.765625, |
| "kd_loss": 0.1328125, |
| "learning_rate": 5.822132259737565e-06, |
| "loss": 0.1858, |
| "step": 465, |
| "student_loss": 0.0023128872271627188, |
| "teacher_loss": 0.0006816239329054952 |
| }, |
| { |
| "epoch": 0.13552479815455595, |
| "grad_norm": 5.3125, |
| "kd_loss": 0.09521484375, |
| "learning_rate": 5.742161112040237e-06, |
| "loss": 0.1887, |
| "step": 470, |
| "student_loss": 0.0013243159046396613, |
| "teacher_loss": 0.0008191668312065303 |
| }, |
| { |
| "epoch": 0.13696655132641292, |
| "grad_norm": 4.34375, |
| "kd_loss": 0.12890625, |
| "learning_rate": 5.661995345453867e-06, |
| "loss": 0.1479, |
| "step": 475, |
| "student_loss": 0.0022922754287719727, |
| "teacher_loss": 0.0007053640438243747 |
| }, |
| { |
| "epoch": 0.1384083044982699, |
| "grad_norm": 4.1875, |
| "kd_loss": 0.10888671875, |
| "learning_rate": 5.581655982061367e-06, |
| "loss": 0.2052, |
| "step": 480, |
| "student_loss": 0.016067378222942352, |
| "teacher_loss": 0.0467948317527771 |
| }, |
| { |
| "epoch": 0.13985005767012687, |
| "grad_norm": 4.625, |
| "kd_loss": 0.09521484375, |
| "learning_rate": 5.501164089468406e-06, |
| "loss": 0.1535, |
| "step": 485, |
| "student_loss": 0.001838831347413361, |
| "teacher_loss": 0.0018535954877734184 |
| }, |
| { |
| "epoch": 0.14129181084198386, |
| "grad_norm": 4.3125, |
| "kd_loss": 0.099609375, |
| "learning_rate": 5.4205407752787884e-06, |
| "loss": 0.1702, |
| "step": 490, |
| "student_loss": 0.0021060549188405275, |
| "teacher_loss": 0.0013811348471790552 |
| }, |
| { |
| "epoch": 0.14273356401384082, |
| "grad_norm": 5.0, |
| "kd_loss": 0.12109375, |
| "learning_rate": 5.339807181559359e-06, |
| "loss": 0.1698, |
| "step": 495, |
| "student_loss": 0.004670781549066305, |
| "teacher_loss": 0.0008499641553498805 |
| }, |
| { |
| "epoch": 0.14417531718569782, |
| "grad_norm": 7.28125, |
| "kd_loss": 0.1005859375, |
| "learning_rate": 5.258984479295853e-06, |
| "loss": 0.1663, |
| "step": 500, |
| "student_loss": 0.0009078571456484497, |
| "teacher_loss": 0.0008732817368581891 |
| }, |
| { |
| "epoch": 0.14561707035755478, |
| "grad_norm": 4.8125, |
| "kd_loss": 0.09521484375, |
| "learning_rate": 5.1780938628411795e-06, |
| "loss": 0.1857, |
| "step": 505, |
| "student_loss": 0.002319552004337311, |
| "teacher_loss": 0.0009417013498023152 |
| }, |
| { |
| "epoch": 0.14705882352941177, |
| "grad_norm": 5.5, |
| "kd_loss": 0.09130859375, |
| "learning_rate": 5.097156544357567e-06, |
| "loss": 0.168, |
| "step": 510, |
| "student_loss": 0.001904567121528089, |
| "teacher_loss": 0.0011415554909035563 |
| }, |
| { |
| "epoch": 0.14850057670126873, |
| "grad_norm": 4.9375, |
| "kd_loss": 0.11474609375, |
| "learning_rate": 5.016193748254045e-06, |
| "loss": 0.1561, |
| "step": 515, |
| "student_loss": 0.004430091939866543, |
| "teacher_loss": 0.000705283775459975 |
| }, |
| { |
| "epoch": 0.14994232987312572, |
| "grad_norm": 4.0, |
| "kd_loss": 0.1005859375, |
| "learning_rate": 4.935226705620699e-06, |
| "loss": 0.1742, |
| "step": 520, |
| "student_loss": 0.4650050103664398, |
| "teacher_loss": 0.011486685834825039 |
| }, |
| { |
| "epoch": 0.1513840830449827, |
| "grad_norm": 2.28125, |
| "kd_loss": 0.09423828125, |
| "learning_rate": 4.8542766486612035e-06, |
| "loss": 0.1568, |
| "step": 525, |
| "student_loss": 0.004688178189098835, |
| "teacher_loss": 0.0005817305063828826 |
| }, |
| { |
| "epoch": 0.15282583621683968, |
| "grad_norm": 6.21875, |
| "kd_loss": 0.1025390625, |
| "learning_rate": 4.773364805125025e-06, |
| "loss": 0.1569, |
| "step": 530, |
| "student_loss": 0.002902889158576727, |
| "teacher_loss": 0.0036108619533479214 |
| }, |
| { |
| "epoch": 0.15426758938869667, |
| "grad_norm": 2.8125, |
| "kd_loss": 0.0947265625, |
| "learning_rate": 4.6925123927408265e-06, |
| "loss": 0.146, |
| "step": 535, |
| "student_loss": 0.004958340898156166, |
| "teacher_loss": 0.0009314365452155471 |
| }, |
| { |
| "epoch": 0.15570934256055363, |
| "grad_norm": 5.09375, |
| "kd_loss": 0.1083984375, |
| "learning_rate": 4.611740613652485e-06, |
| "loss": 0.1485, |
| "step": 540, |
| "student_loss": 0.022316506132483482, |
| "teacher_loss": 0.0009606878156773746 |
| }, |
| { |
| "epoch": 0.15715109573241062, |
| "grad_norm": 5.90625, |
| "kd_loss": 0.095703125, |
| "learning_rate": 4.531070648859186e-06, |
| "loss": 0.171, |
| "step": 545, |
| "student_loss": 0.005919112823903561, |
| "teacher_loss": 0.016547029837965965 |
| }, |
| { |
| "epoch": 0.15859284890426759, |
| "grad_norm": 4.375, |
| "kd_loss": 0.1123046875, |
| "learning_rate": 4.450523652661086e-06, |
| "loss": 0.142, |
| "step": 550, |
| "student_loss": 0.0007885328959673643, |
| "teacher_loss": 0.0045303236693143845 |
| }, |
| { |
| "epoch": 0.16003460207612458, |
| "grad_norm": 4.03125, |
| "kd_loss": 0.09326171875, |
| "learning_rate": 4.370120747111956e-06, |
| "loss": 0.1566, |
| "step": 555, |
| "student_loss": 0.0045122369192540646, |
| "teacher_loss": 0.0012258175993338227 |
| }, |
| { |
| "epoch": 0.16147635524798154, |
| "grad_norm": 5.1875, |
| "kd_loss": 0.09423828125, |
| "learning_rate": 4.289883016480291e-06, |
| "loss": 0.1694, |
| "step": 560, |
| "student_loss": 0.038154710084199905, |
| "teacher_loss": 0.00046423348248936236 |
| }, |
| { |
| "epoch": 0.16291810841983853, |
| "grad_norm": 3.34375, |
| "kd_loss": 0.109375, |
| "learning_rate": 4.209831501720328e-06, |
| "loss": 0.1557, |
| "step": 565, |
| "student_loss": 0.018078487366437912, |
| "teacher_loss": 0.021091489121317863 |
| }, |
| { |
| "epoch": 0.1643598615916955, |
| "grad_norm": 5.21875, |
| "kd_loss": 0.1689453125, |
| "learning_rate": 4.129987194954421e-06, |
| "loss": 0.17, |
| "step": 570, |
| "student_loss": 0.15178009867668152, |
| "teacher_loss": 0.0086033521220088 |
| }, |
| { |
| "epoch": 0.16580161476355249, |
| "grad_norm": 2.890625, |
| "kd_loss": 0.08544921875, |
| "learning_rate": 4.050371033968216e-06, |
| "loss": 0.1651, |
| "step": 575, |
| "student_loss": 0.0016716659301891923, |
| "teacher_loss": 0.0008001797832548618 |
| }, |
| { |
| "epoch": 0.16724336793540945, |
| "grad_norm": 4.1875, |
| "kd_loss": 0.236328125, |
| "learning_rate": 3.9710038967200825e-06, |
| "loss": 0.1443, |
| "step": 580, |
| "student_loss": 0.004638470709323883, |
| "teacher_loss": 0.006588623858988285 |
| }, |
| { |
| "epoch": 0.16868512110726644, |
| "grad_norm": 2.890625, |
| "kd_loss": 0.0966796875, |
| "learning_rate": 3.89190659586623e-06, |
| "loss": 0.1551, |
| "step": 585, |
| "student_loss": 0.00187311926856637, |
| "teacher_loss": 0.0005596915725618601 |
| }, |
| { |
| "epoch": 0.1701268742791234, |
| "grad_norm": 6.65625, |
| "kd_loss": 0.091796875, |
| "learning_rate": 3.8130998733029517e-06, |
| "loss": 0.1722, |
| "step": 590, |
| "student_loss": 0.017516393214464188, |
| "teacher_loss": 0.002362610539421439 |
| }, |
| { |
| "epoch": 0.1715686274509804, |
| "grad_norm": 2.234375, |
| "kd_loss": 0.09423828125, |
| "learning_rate": 3.734604394727419e-06, |
| "loss": 0.1736, |
| "step": 595, |
| "student_loss": 0.0015100985765457153, |
| "teacher_loss": 0.0012370356125757098 |
| }, |
| { |
| "epoch": 0.17301038062283736, |
| "grad_norm": 5.375, |
| "kd_loss": 0.1064453125, |
| "learning_rate": 3.656440744218464e-06, |
| "loss": 0.1822, |
| "step": 600, |
| "student_loss": 0.3471376895904541, |
| "teacher_loss": 0.006922336760908365 |
| }, |
| { |
| "epoch": 0.17445213379469435, |
| "grad_norm": 5.65625, |
| "kd_loss": 0.10400390625, |
| "learning_rate": 3.578629418838757e-06, |
| "loss": 0.1706, |
| "step": 605, |
| "student_loss": 0.09560892730951309, |
| "teacher_loss": 0.04084807634353638 |
| }, |
| { |
| "epoch": 0.17589388696655134, |
| "grad_norm": 3.609375, |
| "kd_loss": 0.10888671875, |
| "learning_rate": 3.5011908232598124e-06, |
| "loss": 0.1418, |
| "step": 610, |
| "student_loss": 0.0035140912514179945, |
| "teacher_loss": 0.0005105194286443293 |
| }, |
| { |
| "epoch": 0.1773356401384083, |
| "grad_norm": 3.375, |
| "kd_loss": 0.10498046875, |
| "learning_rate": 3.4241452644112085e-06, |
| "loss": 0.1453, |
| "step": 615, |
| "student_loss": 0.0014288002857938409, |
| "teacher_loss": 0.001070382189936936 |
| }, |
| { |
| "epoch": 0.1787773933102653, |
| "grad_norm": 2.578125, |
| "kd_loss": 0.1416015625, |
| "learning_rate": 3.3475129461554567e-06, |
| "loss": 0.1677, |
| "step": 620, |
| "student_loss": 0.0047634500078856945, |
| "teacher_loss": 0.009211473166942596 |
| }, |
| { |
| "epoch": 0.18021914648212226, |
| "grad_norm": 3.578125, |
| "kd_loss": 0.09814453125, |
| "learning_rate": 3.271313963989886e-06, |
| "loss": 0.1556, |
| "step": 625, |
| "student_loss": 0.019517898559570312, |
| "teacher_loss": 0.004466219339519739 |
| }, |
| { |
| "epoch": 0.18166089965397925, |
| "grad_norm": 4.21875, |
| "kd_loss": 0.10009765625, |
| "learning_rate": 3.195568299776945e-06, |
| "loss": 0.1587, |
| "step": 630, |
| "student_loss": 0.09341763705015182, |
| "teacher_loss": 0.0017769263358786702 |
| }, |
| { |
| "epoch": 0.1831026528258362, |
| "grad_norm": 4.21875, |
| "kd_loss": 0.09130859375, |
| "learning_rate": 3.1202958165043053e-06, |
| "loss": 0.1877, |
| "step": 635, |
| "student_loss": 0.0012313922634348273, |
| "teacher_loss": 0.0007036713068373501 |
| }, |
| { |
| "epoch": 0.1845444059976932, |
| "grad_norm": 5.84375, |
| "kd_loss": 0.0966796875, |
| "learning_rate": 3.045516253076137e-06, |
| "loss": 0.1654, |
| "step": 640, |
| "student_loss": 0.001555976108647883, |
| "teacher_loss": 0.0010528129059821367 |
| }, |
| { |
| "epoch": 0.18598615916955016, |
| "grad_norm": 7.96875, |
| "kd_loss": 0.09228515625, |
| "learning_rate": 2.9712492191369245e-06, |
| "loss": 0.1564, |
| "step": 645, |
| "student_loss": 0.0033667683601379395, |
| "teacher_loss": 0.0009755496867001057 |
| }, |
| { |
| "epoch": 0.18742791234140715, |
| "grad_norm": 2.921875, |
| "kd_loss": 0.11376953125, |
| "learning_rate": 2.8975141899291777e-06, |
| "loss": 0.1552, |
| "step": 650, |
| "student_loss": 0.001696955063380301, |
| "teacher_loss": 0.0012513434048742056 |
| }, |
| { |
| "epoch": 0.18886966551326412, |
| "grad_norm": 3.234375, |
| "kd_loss": 0.08544921875, |
| "learning_rate": 2.8243305011863843e-06, |
| "loss": 0.1481, |
| "step": 655, |
| "student_loss": 0.027264071628451347, |
| "teacher_loss": 0.0005043753772042692 |
| }, |
| { |
| "epoch": 0.1903114186851211, |
| "grad_norm": 3.84375, |
| "kd_loss": 0.0966796875, |
| "learning_rate": 2.751717344062552e-06, |
| "loss": 0.1658, |
| "step": 660, |
| "student_loss": 0.006026037037372589, |
| "teacher_loss": 0.0037035837303847075 |
| }, |
| { |
| "epoch": 0.19175317185697807, |
| "grad_norm": 3.96875, |
| "kd_loss": 0.11181640625, |
| "learning_rate": 2.6796937600996587e-06, |
| "loss": 0.1585, |
| "step": 665, |
| "student_loss": 0.0023006678093224764, |
| "teacher_loss": 0.0006673650932498276 |
| }, |
| { |
| "epoch": 0.19319492502883506, |
| "grad_norm": 4.125, |
| "kd_loss": 0.08837890625, |
| "learning_rate": 2.6082786362343377e-06, |
| "loss": 0.1818, |
| "step": 670, |
| "student_loss": 0.0015634546289220452, |
| "teacher_loss": 0.0005979883135296404 |
| }, |
| { |
| "epoch": 0.19463667820069205, |
| "grad_norm": 3.59375, |
| "kd_loss": 0.09814453125, |
| "learning_rate": 2.5374906998451094e-06, |
| "loss": 0.1598, |
| "step": 675, |
| "student_loss": 0.0016033351421356201, |
| "teacher_loss": 0.001516613527201116 |
| }, |
| { |
| "epoch": 0.19607843137254902, |
| "grad_norm": 3.625, |
| "kd_loss": 0.162109375, |
| "learning_rate": 2.467348513841447e-06, |
| "loss": 0.1566, |
| "step": 680, |
| "student_loss": 0.15181653201580048, |
| "teacher_loss": 0.04114415496587753 |
| }, |
| { |
| "epoch": 0.197520184544406, |
| "grad_norm": 2.703125, |
| "kd_loss": 0.15625, |
| "learning_rate": 2.3978704717959777e-06, |
| "loss": 0.154, |
| "step": 685, |
| "student_loss": 0.0007377453148365021, |
| "teacher_loss": 0.0339120589196682 |
| }, |
| { |
| "epoch": 0.19896193771626297, |
| "grad_norm": 3.15625, |
| "kd_loss": 0.09521484375, |
| "learning_rate": 2.329074793121085e-06, |
| "loss": 0.1582, |
| "step": 690, |
| "student_loss": 0.0044479165226221085, |
| "teacher_loss": 0.012265580706298351 |
| }, |
| { |
| "epoch": 0.20040369088811996, |
| "grad_norm": 3.234375, |
| "kd_loss": 0.10693359375, |
| "learning_rate": 2.260979518291186e-06, |
| "loss": 0.1724, |
| "step": 695, |
| "student_loss": 0.015444566495716572, |
| "teacher_loss": 0.010763188824057579 |
| }, |
| { |
| "epoch": 0.20184544405997693, |
| "grad_norm": 3.75, |
| "kd_loss": 0.091796875, |
| "learning_rate": 2.1936025041119268e-06, |
| "loss": 0.1753, |
| "step": 700, |
| "student_loss": 0.0019369354704394937, |
| "teacher_loss": 0.0009062191820703447 |
| }, |
| { |
| "epoch": 0.20328719723183392, |
| "grad_norm": 3.296875, |
| "kd_loss": 0.1044921875, |
| "learning_rate": 2.1269614190375477e-06, |
| "loss": 0.1584, |
| "step": 705, |
| "student_loss": 0.001297777402214706, |
| "teacher_loss": 0.0018579652532935143 |
| }, |
| { |
| "epoch": 0.20472895040369088, |
| "grad_norm": 3.75, |
| "kd_loss": 0.10400390625, |
| "learning_rate": 2.061073738537635e-06, |
| "loss": 0.1901, |
| "step": 710, |
| "student_loss": 0.08087821304798126, |
| "teacher_loss": 0.004622929729521275 |
| }, |
| { |
| "epoch": 0.20617070357554787, |
| "grad_norm": 4.1875, |
| "kd_loss": 0.08935546875, |
| "learning_rate": 1.9959567405144825e-06, |
| "loss": 0.1863, |
| "step": 715, |
| "student_loss": 0.009472950361669064, |
| "teacher_loss": 0.007570087444037199 |
| }, |
| { |
| "epoch": 0.20761245674740483, |
| "grad_norm": 4.34375, |
| "kd_loss": 0.11767578125, |
| "learning_rate": 1.931627500772263e-06, |
| "loss": 0.1746, |
| "step": 720, |
| "student_loss": 0.001279592514038086, |
| "teacher_loss": 0.004464911296963692 |
| }, |
| { |
| "epoch": 0.20905420991926182, |
| "grad_norm": 2.25, |
| "kd_loss": 0.10888671875, |
| "learning_rate": 1.8681028885391905e-06, |
| "loss": 0.1528, |
| "step": 725, |
| "student_loss": 0.0024647831451147795, |
| "teacher_loss": 0.0011802453082054853 |
| }, |
| { |
| "epoch": 0.2104959630911188, |
| "grad_norm": 2.671875, |
| "kd_loss": 0.0986328125, |
| "learning_rate": 1.8053995620438625e-06, |
| "loss": 0.152, |
| "step": 730, |
| "student_loss": 0.04315745085477829, |
| "teacher_loss": 0.00156076205894351 |
| }, |
| { |
| "epoch": 0.21193771626297578, |
| "grad_norm": 4.78125, |
| "kd_loss": 0.10498046875, |
| "learning_rate": 1.743533964146924e-06, |
| "loss": 0.1704, |
| "step": 735, |
| "student_loss": 0.0016925001982599497, |
| "teacher_loss": 0.0004609136376529932 |
| }, |
| { |
| "epoch": 0.21337946943483277, |
| "grad_norm": 3.9375, |
| "kd_loss": 0.1142578125, |
| "learning_rate": 1.6825223180292138e-06, |
| "loss": 0.1432, |
| "step": 740, |
| "student_loss": 0.012965809553861618, |
| "teacher_loss": 0.0004022814682684839 |
| }, |
| { |
| "epoch": 0.21482122260668973, |
| "grad_norm": 5.1875, |
| "kd_loss": 0.12890625, |
| "learning_rate": 1.6223806229375182e-06, |
| "loss": 0.1491, |
| "step": 745, |
| "student_loss": 0.1358025223016739, |
| "teacher_loss": 0.02106391452252865 |
| }, |
| { |
| "epoch": 0.21626297577854672, |
| "grad_norm": 4.4375, |
| "kd_loss": 0.11328125, |
| "learning_rate": 1.563124649989043e-06, |
| "loss": 0.1605, |
| "step": 750, |
| "student_loss": 0.10271821916103363, |
| "teacher_loss": 0.004584586247801781 |
| }, |
| { |
| "epoch": 0.2177047289504037, |
| "grad_norm": 4.28125, |
| "kd_loss": 0.091796875, |
| "learning_rate": 1.5047699380357134e-06, |
| "loss": 0.1681, |
| "step": 755, |
| "student_loss": 0.1378186047077179, |
| "teacher_loss": 0.006503281649202108 |
| }, |
| { |
| "epoch": 0.21914648212226068, |
| "grad_norm": 9.5, |
| "kd_loss": 0.0908203125, |
| "learning_rate": 1.4473317895893773e-06, |
| "loss": 0.16, |
| "step": 760, |
| "student_loss": 0.4880536198616028, |
| "teacher_loss": 0.00078756851144135 |
| }, |
| { |
| "epoch": 0.22058823529411764, |
| "grad_norm": 6.96875, |
| "kd_loss": 0.11328125, |
| "learning_rate": 1.39082526680899e-06, |
| "loss": 0.1728, |
| "step": 765, |
| "student_loss": 0.07314120978116989, |
| "teacher_loss": 0.0007181121036410332 |
| }, |
| { |
| "epoch": 0.22202998846597463, |
| "grad_norm": 2.65625, |
| "kd_loss": 0.09912109375, |
| "learning_rate": 1.3352651875508204e-06, |
| "loss": 0.1513, |
| "step": 770, |
| "student_loss": 0.004254591651260853, |
| "teacher_loss": 0.0007984668482095003 |
| }, |
| { |
| "epoch": 0.2234717416378316, |
| "grad_norm": 3.640625, |
| "kd_loss": 0.1044921875, |
| "learning_rate": 1.2806661214827286e-06, |
| "loss": 0.1587, |
| "step": 775, |
| "student_loss": 0.002741985023021698, |
| "teacher_loss": 0.0007013682625256479 |
| }, |
| { |
| "epoch": 0.22491349480968859, |
| "grad_norm": 4.53125, |
| "kd_loss": 0.09765625, |
| "learning_rate": 1.2270423862635188e-06, |
| "loss": 0.1708, |
| "step": 780, |
| "student_loss": 0.0015033041127026081, |
| "teacher_loss": 0.0006990543915890157 |
| }, |
| { |
| "epoch": 0.22635524798154555, |
| "grad_norm": 3.90625, |
| "kd_loss": 0.09423828125, |
| "learning_rate": 1.1744080437883859e-06, |
| "loss": 0.1409, |
| "step": 785, |
| "student_loss": 0.001736114383675158, |
| "teacher_loss": 0.001990710385143757 |
| }, |
| { |
| "epoch": 0.22779700115340254, |
| "grad_norm": 9.25, |
| "kd_loss": 0.11083984375, |
| "learning_rate": 1.1227768965014246e-06, |
| "loss": 0.1804, |
| "step": 790, |
| "student_loss": 0.03133748471736908, |
| "teacher_loss": 0.008059236221015453 |
| }, |
| { |
| "epoch": 0.2292387543252595, |
| "grad_norm": 3.84375, |
| "kd_loss": 0.11474609375, |
| "learning_rate": 1.0721624837761768e-06, |
| "loss": 0.1703, |
| "step": 795, |
| "student_loss": 0.005942783784121275, |
| "teacher_loss": 0.0006947139045223594 |
| }, |
| { |
| "epoch": 0.2306805074971165, |
| "grad_norm": 3.921875, |
| "kd_loss": 0.166015625, |
| "learning_rate": 1.0225780783651689e-06, |
| "loss": 0.1879, |
| "step": 800, |
| "student_loss": 0.05473716929554939, |
| "teacher_loss": 0.03766282647848129 |
| }, |
| { |
| "epoch": 0.23212226066897348, |
| "grad_norm": 4.34375, |
| "kd_loss": 0.08642578125, |
| "learning_rate": 9.740366829193587e-07, |
| "loss": 0.1824, |
| "step": 805, |
| "student_loss": 0.001553440117277205, |
| "teacher_loss": 0.0012171101989224553 |
| }, |
| { |
| "epoch": 0.23356401384083045, |
| "grad_norm": 7.34375, |
| "kd_loss": 0.09423828125, |
| "learning_rate": 9.265510265784189e-07, |
| "loss": 0.1771, |
| "step": 810, |
| "student_loss": 0.0017513898201286793, |
| "teacher_loss": 0.0006000488647259772 |
| }, |
| { |
| "epoch": 0.23500576701268744, |
| "grad_norm": 5.46875, |
| "kd_loss": 0.09912109375, |
| "learning_rate": 8.801335616327378e-07, |
| "loss": 0.1664, |
| "step": 815, |
| "student_loss": 0.007318615913391113, |
| "teacher_loss": 0.010634765028953552 |
| }, |
| { |
| "epoch": 0.2364475201845444, |
| "grad_norm": 3.90625, |
| "kd_loss": 0.11474609375, |
| "learning_rate": 8.347964602580245e-07, |
| "loss": 0.1615, |
| "step": 820, |
| "student_loss": 0.04161018878221512, |
| "teacher_loss": 0.001969601958990097 |
| }, |
| { |
| "epoch": 0.2378892733564014, |
| "grad_norm": 4.65625, |
| "kd_loss": 0.1142578125, |
| "learning_rate": 7.905516113233652e-07, |
| "loss": 0.1532, |
| "step": 825, |
| "student_loss": 0.000943031394854188, |
| "teacher_loss": 0.020420216023921967 |
| }, |
| { |
| "epoch": 0.23933102652825836, |
| "grad_norm": 2.875, |
| "kd_loss": 0.09521484375, |
| "learning_rate": 7.474106172735746e-07, |
| "loss": 0.1601, |
| "step": 830, |
| "student_loss": 0.018866391852498055, |
| "teacher_loss": 0.0037704347632825375 |
| }, |
| { |
| "epoch": 0.24077277970011535, |
| "grad_norm": 5.09375, |
| "kd_loss": 0.0888671875, |
| "learning_rate": 7.053847910866513e-07, |
| "loss": 0.1552, |
| "step": 835, |
| "student_loss": 0.12261331081390381, |
| "teacher_loss": 0.005213484168052673 |
| }, |
| { |
| "epoch": 0.2422145328719723, |
| "grad_norm": 2.84375, |
| "kd_loss": 0.11767578125, |
| "learning_rate": 6.644851533071556e-07, |
| "loss": 0.1478, |
| "step": 840, |
| "student_loss": 0.0071019199676811695, |
| "teacher_loss": 0.0005135077517479658 |
| }, |
| { |
| "epoch": 0.2436562860438293, |
| "grad_norm": 4.6875, |
| "kd_loss": 0.1337890625, |
| "learning_rate": 6.24722429156251e-07, |
| "loss": 0.228, |
| "step": 845, |
| "student_loss": 0.0022525617387145758, |
| "teacher_loss": 0.0012849880149587989 |
| }, |
| { |
| "epoch": 0.24509803921568626, |
| "grad_norm": 6.5625, |
| "kd_loss": 0.0966796875, |
| "learning_rate": 5.861070457192081e-07, |
| "loss": 0.1695, |
| "step": 850, |
| "student_loss": 0.06699959933757782, |
| "teacher_loss": 0.0007787467329762876 |
| }, |
| { |
| "epoch": 0.24653979238754326, |
| "grad_norm": 2.46875, |
| "kd_loss": 0.10546875, |
| "learning_rate": 5.486491292110796e-07, |
| "loss": 0.1498, |
| "step": 855, |
| "student_loss": 0.0011905976571142673, |
| "teacher_loss": 0.0006796009838581085 |
| }, |
| { |
| "epoch": 0.24798154555940022, |
| "grad_norm": 3.40625, |
| "kd_loss": 0.10107421875, |
| "learning_rate": 5.123585023212785e-07, |
| "loss": 0.1846, |
| "step": 860, |
| "student_loss": 0.005176758859306574, |
| "teacher_loss": 0.0015740481903776526 |
| }, |
| { |
| "epoch": 0.2494232987312572, |
| "grad_norm": 3.46875, |
| "kd_loss": 0.095703125, |
| "learning_rate": 4.772446816377408e-07, |
| "loss": 0.1519, |
| "step": 865, |
| "student_loss": 0.0017797622131183743, |
| "teacher_loss": 0.001129323965869844 |
| }, |
| { |
| "epoch": 0.2508650519031142, |
| "grad_norm": 2.515625, |
| "kd_loss": 0.09326171875, |
| "learning_rate": 4.4331687515137614e-07, |
| "loss": 0.1724, |
| "step": 870, |
| "student_loss": 0.0048367022536695, |
| "teacher_loss": 0.0008864006958901882 |
| }, |
| { |
| "epoch": 0.25230680507497116, |
| "grad_norm": 2.390625, |
| "kd_loss": 0.09619140625, |
| "learning_rate": 4.1058397984142405e-07, |
| "loss": 0.1396, |
| "step": 875, |
| "student_loss": 0.0008503241115249693, |
| "teacher_loss": 0.0008235117420554161 |
| }, |
| { |
| "epoch": 0.2537485582468281, |
| "grad_norm": 3.0625, |
| "kd_loss": 0.09423828125, |
| "learning_rate": 3.790545793423761e-07, |
| "loss": 0.1662, |
| "step": 880, |
| "student_loss": 0.0019780993461608887, |
| "teacher_loss": 0.0007267682813107967 |
| }, |
| { |
| "epoch": 0.25519031141868515, |
| "grad_norm": 3.25, |
| "kd_loss": 0.1025390625, |
| "learning_rate": 3.4873694169306915e-07, |
| "loss": 0.1567, |
| "step": 885, |
| "student_loss": 0.020344872027635574, |
| "teacher_loss": 0.056762393563985825 |
| }, |
| { |
| "epoch": 0.2566320645905421, |
| "grad_norm": 2.796875, |
| "kd_loss": 0.09326171875, |
| "learning_rate": 3.196390171685343e-07, |
| "loss": 0.1636, |
| "step": 890, |
| "student_loss": 0.001583437086082995, |
| "teacher_loss": 0.00122374901548028 |
| }, |
| { |
| "epoch": 0.25807381776239907, |
| "grad_norm": 4.09375, |
| "kd_loss": 0.11083984375, |
| "learning_rate": 2.917684361951728e-07, |
| "loss": 0.1583, |
| "step": 895, |
| "student_loss": 0.11338726431131363, |
| "teacher_loss": 0.006426448002457619 |
| }, |
| { |
| "epoch": 0.25951557093425603, |
| "grad_norm": 3.34375, |
| "kd_loss": 0.1201171875, |
| "learning_rate": 2.65132507349814e-07, |
| "loss": 0.1934, |
| "step": 900, |
| "student_loss": 0.0020212531089782715, |
| "teacher_loss": 0.03192909434437752 |
| }, |
| { |
| "epoch": 0.26095732410611305, |
| "grad_norm": 3.3125, |
| "kd_loss": 0.09716796875, |
| "learning_rate": 2.397382154431621e-07, |
| "loss": 0.1627, |
| "step": 905, |
| "student_loss": 0.0032915188930928707, |
| "teacher_loss": 0.0014988789334893227 |
| }, |
| { |
| "epoch": 0.26239907727797, |
| "grad_norm": 4.0, |
| "kd_loss": 0.0986328125, |
| "learning_rate": 2.1559221968815547e-07, |
| "loss": 0.182, |
| "step": 910, |
| "student_loss": 0.001489490270614624, |
| "teacher_loss": 0.0012580000329762697 |
| }, |
| { |
| "epoch": 0.263840830449827, |
| "grad_norm": 3.390625, |
| "kd_loss": 0.09716796875, |
| "learning_rate": 1.9270085195370048e-07, |
| "loss": 0.143, |
| "step": 915, |
| "student_loss": 0.04323554039001465, |
| "teacher_loss": 0.001784435473382473 |
| }, |
| { |
| "epoch": 0.26528258362168394, |
| "grad_norm": 2.546875, |
| "kd_loss": 0.1318359375, |
| "learning_rate": 1.7107011510424766e-07, |
| "loss": 0.1721, |
| "step": 920, |
| "student_loss": 0.01360052265226841, |
| "teacher_loss": 0.018217744305729866 |
| }, |
| { |
| "epoch": 0.26672433679354096, |
| "grad_norm": 3.71875, |
| "kd_loss": 0.087890625, |
| "learning_rate": 1.5070568142564912e-07, |
| "loss": 0.1489, |
| "step": 925, |
| "student_loss": 0.0011945515871047974, |
| "teacher_loss": 0.0009807685855776072 |
| }, |
| { |
| "epoch": 0.2681660899653979, |
| "grad_norm": 3.40625, |
| "kd_loss": 0.1142578125, |
| "learning_rate": 1.3161289113769405e-07, |
| "loss": 0.1539, |
| "step": 930, |
| "student_loss": 0.055781442672014236, |
| "teacher_loss": 0.0011405627010390162 |
| }, |
| { |
| "epoch": 0.2696078431372549, |
| "grad_norm": 2.84375, |
| "kd_loss": 0.10009765625, |
| "learning_rate": 1.1379675099373489e-07, |
| "loss": 0.1501, |
| "step": 935, |
| "student_loss": 0.005637112073600292, |
| "teacher_loss": 0.002441459335386753 |
| }, |
| { |
| "epoch": 0.2710495963091119, |
| "grad_norm": 5.625, |
| "kd_loss": 0.18359375, |
| "learning_rate": 9.726193296774767e-08, |
| "loss": 0.1684, |
| "step": 940, |
| "student_loss": 0.011978531256318092, |
| "teacher_loss": 0.009902331046760082 |
| }, |
| { |
| "epoch": 0.27249134948096887, |
| "grad_norm": 5.375, |
| "kd_loss": 0.0966796875, |
| "learning_rate": 8.201277302919086e-08, |
| "loss": 0.1661, |
| "step": 945, |
| "student_loss": 0.12388397753238678, |
| "teacher_loss": 0.0019162542885169387 |
| }, |
| { |
| "epoch": 0.27393310265282583, |
| "grad_norm": 3.046875, |
| "kd_loss": 0.12451171875, |
| "learning_rate": 6.805327000596995e-08, |
| "loss": 0.1539, |
| "step": 950, |
| "student_loss": 0.004596967715770006, |
| "teacher_loss": 0.0005623517790809274 |
| }, |
| { |
| "epoch": 0.2753748558246828, |
| "grad_norm": 4.3125, |
| "kd_loss": 0.083984375, |
| "learning_rate": 5.538708453581787e-08, |
| "loss": 0.1616, |
| "step": 955, |
| "student_loss": 0.007824474945664406, |
| "teacher_loss": 0.0011617924319580197 |
| }, |
| { |
| "epoch": 0.2768166089965398, |
| "grad_norm": 8.0625, |
| "kd_loss": 0.0947265625, |
| "learning_rate": 4.40175381063529e-08, |
| "loss": 0.1586, |
| "step": 960, |
| "student_loss": 0.0018909722566604614, |
| "teacher_loss": 0.00250077061355114 |
| }, |
| { |
| "epoch": 0.2782583621683968, |
| "grad_norm": 4.40625, |
| "kd_loss": 0.162109375, |
| "learning_rate": 3.394761218407705e-08, |
| "loss": 0.1666, |
| "step": 965, |
| "student_loss": 0.11850693821907043, |
| "teacher_loss": 0.008037789724767208 |
| }, |
| { |
| "epoch": 0.27970011534025374, |
| "grad_norm": 3.1875, |
| "kd_loss": 0.11083984375, |
| "learning_rate": 2.5179947432540376e-08, |
| "loss": 0.1665, |
| "step": 970, |
| "student_loss": 0.0006995275616645813, |
| "teacher_loss": 0.0004927213303744793 |
| }, |
| { |
| "epoch": 0.2811418685121107, |
| "grad_norm": 2.9375, |
| "kd_loss": 0.087890625, |
| "learning_rate": 1.7716843019867646e-08, |
| "loss": 0.1614, |
| "step": 975, |
| "student_loss": 0.11272090673446655, |
| "teacher_loss": 0.002362866187468171 |
| }, |
| { |
| "epoch": 0.2825836216839677, |
| "grad_norm": 4.6875, |
| "kd_loss": 0.0927734375, |
| "learning_rate": 1.156025601584676e-08, |
| "loss": 0.1578, |
| "step": 980, |
| "student_loss": 0.002314644167199731, |
| "teacher_loss": 0.0005237645236775279 |
| }, |
| { |
| "epoch": 0.2840253748558247, |
| "grad_norm": 5.28125, |
| "kd_loss": 0.09814453125, |
| "learning_rate": 6.711800878718144e-09, |
| "loss": 0.1708, |
| "step": 985, |
| "student_loss": 0.0012468647910282016, |
| "teacher_loss": 0.0008435134077444673 |
| }, |
| { |
| "epoch": 0.28546712802768165, |
| "grad_norm": 9.3125, |
| "kd_loss": 0.09130859375, |
| "learning_rate": 3.1727490318111953e-09, |
| "loss": 0.1632, |
| "step": 990, |
| "student_loss": 0.002036402700468898, |
| "teacher_loss": 0.0007938549388200045 |
| }, |
| { |
| "epoch": 0.2869088811995386, |
| "grad_norm": 4.5, |
| "kd_loss": 0.10986328125, |
| "learning_rate": 9.440285301370865e-10, |
| "loss": 0.183, |
| "step": 995, |
| "student_loss": 0.0015997332520782948, |
| "teacher_loss": 0.00439803209155798 |
| }, |
| { |
| "epoch": 0.28835063437139563, |
| "grad_norm": 2.375, |
| "kd_loss": 0.09423828125, |
| "learning_rate": 2.622381702066523e-11, |
| "loss": 0.1477, |
| "step": 1000, |
| "student_loss": 0.07939934730529785, |
| "teacher_loss": 0.0005344336968846619 |
| }, |
| { |
| "epoch": 0.28835063437139563, |
| "kd_loss": 0.09423828125, |
| "step": 1000, |
| "student_loss": 0.07939934730529785, |
| "teacher_loss": 0.0005344336968846619, |
| "total_flos": 0.0, |
| "train_loss": 0.22623604363203048, |
| "train_runtime": 7596.2458, |
| "train_samples_per_second": 2.106, |
| "train_steps_per_second": 0.132 |
| }, |
| { |
| "epoch": 0.2897923875432526, |
| "grad_norm": 3.265625, |
| "kd_loss": 0.1083984375, |
| "learning_rate": 7.75705864825114e-06, |
| "loss": 0.1639, |
| "step": 1005, |
| "student_loss": 0.021362992003560066, |
| "teacher_loss": 0.07951661199331284 |
| }, |
| { |
| "epoch": 0.29123414071510956, |
| "grad_norm": 7.375, |
| "kd_loss": 0.09326171875, |
| "learning_rate": 7.734502946076656e-06, |
| "loss": 0.1608, |
| "step": 1010, |
| "student_loss": 0.0017111932393163443, |
| "teacher_loss": 0.004124164581298828 |
| }, |
| { |
| "epoch": 0.2926758938869666, |
| "grad_norm": 5.3125, |
| "kd_loss": 0.146484375, |
| "learning_rate": 7.711867567242769e-06, |
| "loss": 0.1511, |
| "step": 1015, |
| "student_loss": 0.0037448785733431578, |
| "teacher_loss": 0.01143695879727602 |
| }, |
| { |
| "epoch": 0.29411764705882354, |
| "grad_norm": 3.046875, |
| "kd_loss": 0.1767578125, |
| "learning_rate": 7.689153171288487e-06, |
| "loss": 0.1481, |
| "step": 1020, |
| "student_loss": 0.018935445696115494, |
| "teacher_loss": 0.03223176300525665 |
| }, |
| { |
| "epoch": 0.2955594002306805, |
| "grad_norm": 5.03125, |
| "kd_loss": 0.10107421875, |
| "learning_rate": 7.666360420055188e-06, |
| "loss": 0.1648, |
| "step": 1025, |
| "student_loss": 0.00270785391330719, |
| "teacher_loss": 0.0004231084603816271 |
| }, |
| { |
| "epoch": 0.29700115340253747, |
| "grad_norm": 4.71875, |
| "kd_loss": 0.0908203125, |
| "learning_rate": 7.643489977667327e-06, |
| "loss": 0.1659, |
| "step": 1030, |
| "student_loss": 0.02544678933918476, |
| "teacher_loss": 0.0005848580040037632 |
| }, |
| { |
| "epoch": 0.2984429065743945, |
| "grad_norm": 4.75, |
| "kd_loss": 0.0986328125, |
| "learning_rate": 7.6205425105130855e-06, |
| "loss": 0.1671, |
| "step": 1035, |
| "student_loss": 0.009377697482705116, |
| "teacher_loss": 0.002407669322565198 |
| }, |
| { |
| "epoch": 0.29988465974625145, |
| "grad_norm": 6.0, |
| "kd_loss": 0.1005859375, |
| "learning_rate": 7.597518687224959e-06, |
| "loss": 0.1854, |
| "step": 1040, |
| "student_loss": 0.09456347674131393, |
| "teacher_loss": 0.0008634831756353378 |
| }, |
| { |
| "epoch": 0.3013264129181084, |
| "grad_norm": 5.03125, |
| "kd_loss": 0.07958984375, |
| "learning_rate": 7.574419178660269e-06, |
| "loss": 0.1669, |
| "step": 1045, |
| "student_loss": 0.0017204463947564363, |
| "teacher_loss": 0.0008596886764280498 |
| }, |
| { |
| "epoch": 0.3027681660899654, |
| "grad_norm": 6.21875, |
| "kd_loss": 0.095703125, |
| "learning_rate": 7.551244657881618e-06, |
| "loss": 0.1942, |
| "step": 1050, |
| "student_loss": 0.16669750213623047, |
| "teacher_loss": 0.0011612839298322797 |
| }, |
| { |
| "epoch": 0.3042099192618224, |
| "grad_norm": 2.484375, |
| "kd_loss": 0.09375, |
| "learning_rate": 7.527995800137287e-06, |
| "loss": 0.1475, |
| "step": 1055, |
| "student_loss": 0.0016981420340016484, |
| "teacher_loss": 0.001001509721390903 |
| }, |
| { |
| "epoch": 0.30565167243367936, |
| "grad_norm": 5.21875, |
| "kd_loss": 0.09326171875, |
| "learning_rate": 7.504673282841544e-06, |
| "loss": 0.1647, |
| "step": 1060, |
| "student_loss": 0.09439224749803543, |
| "teacher_loss": 0.0003985276853200048 |
| }, |
| { |
| "epoch": 0.3070934256055363, |
| "grad_norm": 4.875, |
| "kd_loss": 0.1044921875, |
| "learning_rate": 7.481277785554918e-06, |
| "loss": 0.161, |
| "step": 1065, |
| "student_loss": 0.059324074536561966, |
| "teacher_loss": 0.0028861502651125193 |
| }, |
| { |
| "epoch": 0.30853517877739334, |
| "grad_norm": 6.78125, |
| "kd_loss": 0.11474609375, |
| "learning_rate": 7.457809989964393e-06, |
| "loss": 0.1812, |
| "step": 1070, |
| "student_loss": 0.0447225496172905, |
| "teacher_loss": 0.00039993959944695234 |
| }, |
| { |
| "epoch": 0.3099769319492503, |
| "grad_norm": 3.84375, |
| "kd_loss": 0.099609375, |
| "learning_rate": 7.434270579863549e-06, |
| "loss": 0.1539, |
| "step": 1075, |
| "student_loss": 0.0011834139004349709, |
| "teacher_loss": 0.0010074133751913905 |
| }, |
| { |
| "epoch": 0.31141868512110726, |
| "grad_norm": 4.21875, |
| "kd_loss": 0.1005859375, |
| "learning_rate": 7.4106602411326345e-06, |
| "loss": 0.1642, |
| "step": 1080, |
| "student_loss": 0.003048022510483861, |
| "teacher_loss": 0.00988290086388588 |
| }, |
| { |
| "epoch": 0.3128604382929642, |
| "grad_norm": 4.0625, |
| "kd_loss": 0.09765625, |
| "learning_rate": 7.386979661718585e-06, |
| "loss": 0.1702, |
| "step": 1085, |
| "student_loss": 0.003489202819764614, |
| "teacher_loss": 0.0008156410767696798 |
| }, |
| { |
| "epoch": 0.31430219146482125, |
| "grad_norm": 5.65625, |
| "kd_loss": 0.0888671875, |
| "learning_rate": 7.363229531614973e-06, |
| "loss": 0.1515, |
| "step": 1090, |
| "student_loss": 0.2183372676372528, |
| "teacher_loss": 0.004773187451064587 |
| }, |
| { |
| "epoch": 0.3157439446366782, |
| "grad_norm": 4.125, |
| "kd_loss": 0.1171875, |
| "learning_rate": 7.339410542841906e-06, |
| "loss": 0.1799, |
| "step": 1095, |
| "student_loss": 0.13511748611927032, |
| "teacher_loss": 0.00648617185652256 |
| }, |
| { |
| "epoch": 0.31718569780853517, |
| "grad_norm": 5.03125, |
| "kd_loss": 0.10498046875, |
| "learning_rate": 7.315523389425867e-06, |
| "loss": 0.1607, |
| "step": 1100, |
| "student_loss": 0.0012231277069076896, |
| "teacher_loss": 0.0004083520616404712 |
| }, |
| { |
| "epoch": 0.31862745098039214, |
| "grad_norm": 4.0625, |
| "kd_loss": 0.0859375, |
| "learning_rate": 7.291568767379484e-06, |
| "loss": 0.144, |
| "step": 1105, |
| "student_loss": 0.016774829477071762, |
| "teacher_loss": 0.0005346160614863038 |
| }, |
| { |
| "epoch": 0.32006920415224915, |
| "grad_norm": 3.984375, |
| "kd_loss": 0.095703125, |
| "learning_rate": 7.267547374681259e-06, |
| "loss": 0.1602, |
| "step": 1110, |
| "student_loss": 0.024096982553601265, |
| "teacher_loss": 0.0008525612065568566 |
| }, |
| { |
| "epoch": 0.3215109573241061, |
| "grad_norm": 4.59375, |
| "kd_loss": 0.10400390625, |
| "learning_rate": 7.24345991125522e-06, |
| "loss": 0.1532, |
| "step": 1115, |
| "student_loss": 0.0033125807531177998, |
| "teacher_loss": 0.0005502361455000937 |
| }, |
| { |
| "epoch": 0.3229527104959631, |
| "grad_norm": 5.59375, |
| "kd_loss": 0.107421875, |
| "learning_rate": 7.219307078950536e-06, |
| "loss": 0.1625, |
| "step": 1120, |
| "student_loss": 0.0204778965562582, |
| "teacher_loss": 0.004392318893224001 |
| }, |
| { |
| "epoch": 0.32439446366782004, |
| "grad_norm": 10.0625, |
| "kd_loss": 0.0986328125, |
| "learning_rate": 7.195089581521064e-06, |
| "loss": 0.1654, |
| "step": 1125, |
| "student_loss": 0.19389592111110687, |
| "teacher_loss": 0.0214696004986763 |
| }, |
| { |
| "epoch": 0.32583621683967706, |
| "grad_norm": 3.859375, |
| "kd_loss": 0.10986328125, |
| "learning_rate": 7.170808124604842e-06, |
| "loss": 0.1556, |
| "step": 1130, |
| "student_loss": 0.03847292810678482, |
| "teacher_loss": 0.0006047665374353528 |
| }, |
| { |
| "epoch": 0.327277970011534, |
| "grad_norm": 4.96875, |
| "kd_loss": 0.091796875, |
| "learning_rate": 7.14646341570353e-06, |
| "loss": 0.1696, |
| "step": 1135, |
| "student_loss": 0.06856270879507065, |
| "teacher_loss": 0.01667657122015953 |
| }, |
| { |
| "epoch": 0.328719723183391, |
| "grad_norm": 3.90625, |
| "kd_loss": 0.1142578125, |
| "learning_rate": 7.122056164161795e-06, |
| "loss": 0.1778, |
| "step": 1140, |
| "student_loss": 0.021477092057466507, |
| "teacher_loss": 0.012335257604718208 |
| }, |
| { |
| "epoch": 0.330161476355248, |
| "grad_norm": 11.3125, |
| "kd_loss": 0.080078125, |
| "learning_rate": 7.097587081146636e-06, |
| "loss": 0.1589, |
| "step": 1145, |
| "student_loss": 0.045279014855623245, |
| "teacher_loss": 0.0029319608584046364 |
| }, |
| { |
| "epoch": 0.33160322952710497, |
| "grad_norm": 4.125, |
| "kd_loss": 0.08203125, |
| "learning_rate": 7.073056879626681e-06, |
| "loss": 0.204, |
| "step": 1150, |
| "student_loss": 0.002648564986884594, |
| "teacher_loss": 0.0008686608052812517 |
| }, |
| { |
| "epoch": 0.33304498269896193, |
| "grad_norm": 2.640625, |
| "kd_loss": 0.09814453125, |
| "learning_rate": 7.048466274351389e-06, |
| "loss": 0.1497, |
| "step": 1155, |
| "student_loss": 0.058320529758930206, |
| "teacher_loss": 0.00035479728830978274 |
| }, |
| { |
| "epoch": 0.3344867358708189, |
| "grad_norm": 6.4375, |
| "kd_loss": 0.09716796875, |
| "learning_rate": 7.023815981830236e-06, |
| "loss": 0.1904, |
| "step": 1160, |
| "student_loss": 0.0025387869682163, |
| "teacher_loss": 0.028956690803170204 |
| }, |
| { |
| "epoch": 0.3359284890426759, |
| "grad_norm": 3.53125, |
| "kd_loss": 0.10302734375, |
| "learning_rate": 6.999106720311846e-06, |
| "loss": 0.1704, |
| "step": 1165, |
| "student_loss": 0.08381687104701996, |
| "teacher_loss": 0.000761769013479352 |
| }, |
| { |
| "epoch": 0.3373702422145329, |
| "grad_norm": 4.03125, |
| "kd_loss": 0.11669921875, |
| "learning_rate": 6.974339209763043e-06, |
| "loss": 0.1536, |
| "step": 1170, |
| "student_loss": 0.021977189928293228, |
| "teacher_loss": 0.02045821212232113 |
| }, |
| { |
| "epoch": 0.33881199538638984, |
| "grad_norm": 5.125, |
| "kd_loss": 0.09033203125, |
| "learning_rate": 6.949514171847891e-06, |
| "loss": 0.1685, |
| "step": 1175, |
| "student_loss": 0.004976021591573954, |
| "teacher_loss": 0.0024228477850556374 |
| }, |
| { |
| "epoch": 0.3402537485582468, |
| "grad_norm": 4.90625, |
| "kd_loss": 0.09423828125, |
| "learning_rate": 6.924632329906657e-06, |
| "loss": 0.1613, |
| "step": 1180, |
| "student_loss": 0.008308586664497852, |
| "teacher_loss": 0.0010117895435541868 |
| }, |
| { |
| "epoch": 0.3416955017301038, |
| "grad_norm": 2.96875, |
| "kd_loss": 0.09423828125, |
| "learning_rate": 6.899694408934734e-06, |
| "loss": 0.1462, |
| "step": 1185, |
| "student_loss": 0.0045226323418319225, |
| "teacher_loss": 0.000678456446621567 |
| }, |
| { |
| "epoch": 0.3431372549019608, |
| "grad_norm": 2.359375, |
| "kd_loss": 0.09912109375, |
| "learning_rate": 6.874701135561524e-06, |
| "loss": 0.1473, |
| "step": 1190, |
| "student_loss": 0.0010705965105444193, |
| "teacher_loss": 0.0005625460762530565 |
| }, |
| { |
| "epoch": 0.34457900807381775, |
| "grad_norm": 3.90625, |
| "kd_loss": 0.0869140625, |
| "learning_rate": 6.849653238029261e-06, |
| "loss": 0.144, |
| "step": 1195, |
| "student_loss": 0.03853433579206467, |
| "teacher_loss": 0.0004980422672815621 |
| }, |
| { |
| "epoch": 0.3460207612456747, |
| "grad_norm": 2.109375, |
| "kd_loss": 0.08984375, |
| "learning_rate": 6.824551446171788e-06, |
| "loss": 0.2125, |
| "step": 1200, |
| "student_loss": 0.0008995746029540896, |
| "teacher_loss": 0.0007796635036356747 |
| }, |
| { |
| "epoch": 0.34746251441753173, |
| "grad_norm": 4.03125, |
| "kd_loss": 0.0888671875, |
| "learning_rate": 6.7993964913932975e-06, |
| "loss": 0.1821, |
| "step": 1205, |
| "student_loss": 0.07766856998205185, |
| "teacher_loss": 0.0004231579077895731 |
| }, |
| { |
| "epoch": 0.3489042675893887, |
| "grad_norm": 3.3125, |
| "kd_loss": 0.091796875, |
| "learning_rate": 6.774189106647021e-06, |
| "loss": 0.1555, |
| "step": 1210, |
| "student_loss": 0.002076697302982211, |
| "teacher_loss": 0.0008232980617322028 |
| }, |
| { |
| "epoch": 0.35034602076124566, |
| "grad_norm": 5.65625, |
| "kd_loss": 0.095703125, |
| "learning_rate": 6.748930026413865e-06, |
| "loss": 0.1712, |
| "step": 1215, |
| "student_loss": 0.11520007997751236, |
| "teacher_loss": 0.0005120415589772165 |
| }, |
| { |
| "epoch": 0.3517877739331027, |
| "grad_norm": 4.125, |
| "kd_loss": 0.11279296875, |
| "learning_rate": 6.7236199866810185e-06, |
| "loss": 0.164, |
| "step": 1220, |
| "student_loss": 0.06622859835624695, |
| "teacher_loss": 0.010332350619137287 |
| }, |
| { |
| "epoch": 0.35322952710495964, |
| "grad_norm": 4.75, |
| "kd_loss": 0.10595703125, |
| "learning_rate": 6.698259724920503e-06, |
| "loss": 0.1654, |
| "step": 1225, |
| "student_loss": 0.03172338008880615, |
| "teacher_loss": 0.0024025817401707172 |
| }, |
| { |
| "epoch": 0.3546712802768166, |
| "grad_norm": 4.96875, |
| "kd_loss": 0.0986328125, |
| "learning_rate": 6.672849980067685e-06, |
| "loss": 0.1646, |
| "step": 1230, |
| "student_loss": 0.0014958116225898266, |
| "teacher_loss": 0.00136648362968117 |
| }, |
| { |
| "epoch": 0.35611303344867357, |
| "grad_norm": 7.4375, |
| "kd_loss": 0.10791015625, |
| "learning_rate": 6.647391492499746e-06, |
| "loss": 0.1467, |
| "step": 1235, |
| "student_loss": 0.001844382262788713, |
| "teacher_loss": 0.0012089475058019161 |
| }, |
| { |
| "epoch": 0.3575547866205306, |
| "grad_norm": 5.125, |
| "kd_loss": 0.09375, |
| "learning_rate": 6.621885004014113e-06, |
| "loss": 0.1856, |
| "step": 1240, |
| "student_loss": 0.0013189911842346191, |
| "teacher_loss": 0.0011863983236253262 |
| }, |
| { |
| "epoch": 0.35899653979238755, |
| "grad_norm": 3.6875, |
| "kd_loss": 0.10986328125, |
| "learning_rate": 6.596331257806837e-06, |
| "loss": 0.1588, |
| "step": 1245, |
| "student_loss": 0.0016421154141426086, |
| "teacher_loss": 0.001257838448509574 |
| }, |
| { |
| "epoch": 0.3604382929642445, |
| "grad_norm": 5.1875, |
| "kd_loss": 0.1083984375, |
| "learning_rate": 6.570730998450945e-06, |
| "loss": 0.1663, |
| "step": 1250, |
| "student_loss": 0.19827650487422943, |
| "teacher_loss": 0.002391376066952944 |
| }, |
| { |
| "epoch": 0.3618800461361015, |
| "grad_norm": 4.84375, |
| "kd_loss": 0.109375, |
| "learning_rate": 6.545084971874738e-06, |
| "loss": 0.1693, |
| "step": 1255, |
| "student_loss": 0.033441461622714996, |
| "teacher_loss": 0.007763924542814493 |
| }, |
| { |
| "epoch": 0.3633217993079585, |
| "grad_norm": 4.875, |
| "kd_loss": 0.10400390625, |
| "learning_rate": 6.519393925340067e-06, |
| "loss": 0.1687, |
| "step": 1260, |
| "student_loss": 0.0005883485428057611, |
| "teacher_loss": 0.0006072800024412572 |
| }, |
| { |
| "epoch": 0.36476355247981546, |
| "grad_norm": 5.0, |
| "kd_loss": 0.1123046875, |
| "learning_rate": 6.49365860742055e-06, |
| "loss": 0.1807, |
| "step": 1265, |
| "student_loss": 0.3620021343231201, |
| "teacher_loss": 0.0192008875310421 |
| }, |
| { |
| "epoch": 0.3662053056516724, |
| "grad_norm": 2.546875, |
| "kd_loss": 0.09033203125, |
| "learning_rate": 6.467879767979764e-06, |
| "loss": 0.1367, |
| "step": 1270, |
| "student_loss": 0.0019303744193166494, |
| "teacher_loss": 0.0008835737244226038 |
| }, |
| { |
| "epoch": 0.36764705882352944, |
| "grad_norm": 4.84375, |
| "kd_loss": 0.09765625, |
| "learning_rate": 6.442058158149396e-06, |
| "loss": 0.1364, |
| "step": 1275, |
| "student_loss": 0.017311925068497658, |
| "teacher_loss": 0.010822150856256485 |
| }, |
| { |
| "epoch": 0.3690888119953864, |
| "grad_norm": 2.390625, |
| "kd_loss": 0.08837890625, |
| "learning_rate": 6.4161945303073535e-06, |
| "loss": 0.1339, |
| "step": 1280, |
| "student_loss": 0.0016525188693776727, |
| "teacher_loss": 0.00045569639769382775 |
| }, |
| { |
| "epoch": 0.37053056516724336, |
| "grad_norm": 2.59375, |
| "kd_loss": 0.09716796875, |
| "learning_rate": 6.390289638055851e-06, |
| "loss": 0.1514, |
| "step": 1285, |
| "student_loss": 0.0017534851795062423, |
| "teacher_loss": 0.000792986829765141 |
| }, |
| { |
| "epoch": 0.3719723183391003, |
| "grad_norm": 5.15625, |
| "kd_loss": 0.08203125, |
| "learning_rate": 6.364344236199441e-06, |
| "loss": 0.1544, |
| "step": 1290, |
| "student_loss": 0.4242388606071472, |
| "teacher_loss": 0.015242666937410831 |
| }, |
| { |
| "epoch": 0.37341407151095735, |
| "grad_norm": 4.0, |
| "kd_loss": 0.08251953125, |
| "learning_rate": 6.3383590807230264e-06, |
| "loss": 0.1732, |
| "step": 1295, |
| "student_loss": 0.0017893314361572266, |
| "teacher_loss": 0.005244513973593712 |
| }, |
| { |
| "epoch": 0.3748558246828143, |
| "grad_norm": 4.875, |
| "kd_loss": 0.09765625, |
| "learning_rate": 6.3123349287698345e-06, |
| "loss": 0.1343, |
| "step": 1300, |
| "student_loss": 0.0016616806387901306, |
| "teacher_loss": 0.000766773009672761 |
| }, |
| { |
| "epoch": 0.3762975778546713, |
| "grad_norm": 4.34375, |
| "kd_loss": 0.0986328125, |
| "learning_rate": 6.286272538619351e-06, |
| "loss": 0.1656, |
| "step": 1305, |
| "student_loss": 0.001378720044158399, |
| "teacher_loss": 0.2784559726715088 |
| }, |
| { |
| "epoch": 0.37773933102652824, |
| "grad_norm": 4.78125, |
| "kd_loss": 0.09326171875, |
| "learning_rate": 6.260172669665233e-06, |
| "loss": 0.1376, |
| "step": 1310, |
| "student_loss": 0.0015898743877187371, |
| "teacher_loss": 0.001270298846065998 |
| }, |
| { |
| "epoch": 0.37918108419838525, |
| "grad_norm": 6.34375, |
| "kd_loss": 0.09716796875, |
| "learning_rate": 6.234036082393171e-06, |
| "loss": 0.1719, |
| "step": 1315, |
| "student_loss": 0.2977891266345978, |
| "teacher_loss": 0.0018072956008836627 |
| }, |
| { |
| "epoch": 0.3806228373702422, |
| "grad_norm": 4.5, |
| "kd_loss": 0.09814453125, |
| "learning_rate": 6.207863538358741e-06, |
| "loss": 0.166, |
| "step": 1320, |
| "student_loss": 0.002190067432820797, |
| "teacher_loss": 0.0004770367522723973 |
| }, |
| { |
| "epoch": 0.3820645905420992, |
| "grad_norm": 4.625, |
| "kd_loss": 0.10546875, |
| "learning_rate": 6.181655800165207e-06, |
| "loss": 0.1752, |
| "step": 1325, |
| "student_loss": 0.014989044517278671, |
| "teacher_loss": 0.0011476778890937567 |
| }, |
| { |
| "epoch": 0.38350634371395614, |
| "grad_norm": 3.609375, |
| "kd_loss": 0.08740234375, |
| "learning_rate": 6.155413631441307e-06, |
| "loss": 0.1513, |
| "step": 1330, |
| "student_loss": 0.04820695146918297, |
| "teacher_loss": 0.0003813351795542985 |
| }, |
| { |
| "epoch": 0.38494809688581316, |
| "grad_norm": 7.1875, |
| "kd_loss": 0.09228515625, |
| "learning_rate": 6.129137796818997e-06, |
| "loss": 0.149, |
| "step": 1335, |
| "student_loss": 0.0018507987260818481, |
| "teacher_loss": 0.0005417931824922562 |
| }, |
| { |
| "epoch": 0.3863898500576701, |
| "grad_norm": 5.1875, |
| "kd_loss": 0.1728515625, |
| "learning_rate": 6.102829061911176e-06, |
| "loss": 0.1629, |
| "step": 1340, |
| "student_loss": 0.0006290597375482321, |
| "teacher_loss": 0.007059386931359768 |
| }, |
| { |
| "epoch": 0.3878316032295271, |
| "grad_norm": 4.53125, |
| "kd_loss": 0.09375, |
| "learning_rate": 6.076488193289375e-06, |
| "loss": 0.154, |
| "step": 1345, |
| "student_loss": 0.001199022983200848, |
| "teacher_loss": 0.001256449380889535 |
| }, |
| { |
| "epoch": 0.3892733564013841, |
| "grad_norm": 3.796875, |
| "kd_loss": 0.09912109375, |
| "learning_rate": 6.050115958461423e-06, |
| "loss": 0.1423, |
| "step": 1350, |
| "student_loss": 0.028213880956172943, |
| "teacher_loss": 0.0014634531689807773 |
| }, |
| { |
| "epoch": 0.39071510957324107, |
| "grad_norm": 5.1875, |
| "kd_loss": 0.09814453125, |
| "learning_rate": 6.02371312584908e-06, |
| "loss": 0.1606, |
| "step": 1355, |
| "student_loss": 0.046754222363233566, |
| "teacher_loss": 0.0003306324942968786 |
| }, |
| { |
| "epoch": 0.39215686274509803, |
| "grad_norm": 6.75, |
| "kd_loss": 0.1025390625, |
| "learning_rate": 5.997280464765655e-06, |
| "loss": 0.1687, |
| "step": 1360, |
| "student_loss": 0.01089841965585947, |
| "teacher_loss": 0.0005644945777021348 |
| }, |
| { |
| "epoch": 0.393598615916955, |
| "grad_norm": 3.734375, |
| "kd_loss": 0.1025390625, |
| "learning_rate": 5.970818745393579e-06, |
| "loss": 0.1514, |
| "step": 1365, |
| "student_loss": 0.012727648951113224, |
| "teacher_loss": 0.04049056023359299 |
| }, |
| { |
| "epoch": 0.395040369088812, |
| "grad_norm": 4.59375, |
| "kd_loss": 0.095703125, |
| "learning_rate": 5.9443287387619754e-06, |
| "loss": 0.1645, |
| "step": 1370, |
| "student_loss": 0.3426652252674103, |
| "teacher_loss": 0.02165866084396839 |
| }, |
| { |
| "epoch": 0.396482122260669, |
| "grad_norm": 6.65625, |
| "kd_loss": 0.0966796875, |
| "learning_rate": 5.9178112167241805e-06, |
| "loss": 0.1544, |
| "step": 1375, |
| "student_loss": 0.09628524631261826, |
| "teacher_loss": 0.0004050543357152492 |
| }, |
| { |
| "epoch": 0.39792387543252594, |
| "grad_norm": 2.828125, |
| "kd_loss": 0.099609375, |
| "learning_rate": 5.8912669519352725e-06, |
| "loss": 0.1484, |
| "step": 1380, |
| "student_loss": 0.024134894832968712, |
| "teacher_loss": 0.0030764644034206867 |
| }, |
| { |
| "epoch": 0.3993656286043829, |
| "grad_norm": 4.5625, |
| "kd_loss": 0.0849609375, |
| "learning_rate": 5.864696717829539e-06, |
| "loss": 0.1566, |
| "step": 1385, |
| "student_loss": 0.06617551296949387, |
| "teacher_loss": 0.011502874083817005 |
| }, |
| { |
| "epoch": 0.4008073817762399, |
| "grad_norm": 4.6875, |
| "kd_loss": 0.09619140625, |
| "learning_rate": 5.838101288597951e-06, |
| "loss": 0.1487, |
| "step": 1390, |
| "student_loss": 0.0014513310743495822, |
| "teacher_loss": 0.0005679084570147097 |
| }, |
| { |
| "epoch": 0.4022491349480969, |
| "grad_norm": 3.421875, |
| "kd_loss": 0.08984375, |
| "learning_rate": 5.8114814391656046e-06, |
| "loss": 0.1609, |
| "step": 1395, |
| "student_loss": 0.001117706298828125, |
| "teacher_loss": 0.00049404869787395 |
| }, |
| { |
| "epoch": 0.40369088811995385, |
| "grad_norm": 3.375, |
| "kd_loss": 0.091796875, |
| "learning_rate": 5.78483794516914e-06, |
| "loss": 0.1509, |
| "step": 1400, |
| "student_loss": 0.005229848902672529, |
| "teacher_loss": 0.00046830569044686854 |
| }, |
| { |
| "epoch": 0.40513264129181087, |
| "grad_norm": 3.28125, |
| "kd_loss": 0.10107421875, |
| "learning_rate": 5.75817158293414e-06, |
| "loss": 0.1549, |
| "step": 1405, |
| "student_loss": 0.000877993879839778, |
| "teacher_loss": 0.0005521044950000942 |
| }, |
| { |
| "epoch": 0.40657439446366783, |
| "grad_norm": 4.25, |
| "kd_loss": 0.0927734375, |
| "learning_rate": 5.731483129452514e-06, |
| "loss": 0.1684, |
| "step": 1410, |
| "student_loss": 0.008468794636428356, |
| "teacher_loss": 0.0004955396871082485 |
| }, |
| { |
| "epoch": 0.4080161476355248, |
| "grad_norm": 4.78125, |
| "kd_loss": 0.10009765625, |
| "learning_rate": 5.704773362359854e-06, |
| "loss": 0.1529, |
| "step": 1415, |
| "student_loss": 0.023767048493027687, |
| "teacher_loss": 0.014984571374952793 |
| }, |
| { |
| "epoch": 0.40945790080738176, |
| "grad_norm": 4.9375, |
| "kd_loss": 0.103515625, |
| "learning_rate": 5.678043059912776e-06, |
| "loss": 0.1818, |
| "step": 1420, |
| "student_loss": 0.0011405398836359382, |
| "teacher_loss": 0.0018354527419432998 |
| }, |
| { |
| "epoch": 0.4108996539792388, |
| "grad_norm": 3.859375, |
| "kd_loss": 0.0849609375, |
| "learning_rate": 5.6512930009662524e-06, |
| "loss": 0.1643, |
| "step": 1425, |
| "student_loss": 0.20638686418533325, |
| "teacher_loss": 0.0004497423942666501 |
| }, |
| { |
| "epoch": 0.41234140715109574, |
| "grad_norm": 5.15625, |
| "kd_loss": 0.111328125, |
| "learning_rate": 5.624523964950903e-06, |
| "loss": 0.1493, |
| "step": 1430, |
| "student_loss": 0.003926432225853205, |
| "teacher_loss": 0.0005563534796237946 |
| }, |
| { |
| "epoch": 0.4137831603229527, |
| "grad_norm": 2.734375, |
| "kd_loss": 0.08935546875, |
| "learning_rate": 5.597736731850295e-06, |
| "loss": 0.164, |
| "step": 1435, |
| "student_loss": 0.012115873396396637, |
| "teacher_loss": 0.00135420064907521 |
| }, |
| { |
| "epoch": 0.41522491349480967, |
| "grad_norm": 8.375, |
| "kd_loss": 0.095703125, |
| "learning_rate": 5.570932082178219e-06, |
| "loss": 0.1733, |
| "step": 1440, |
| "student_loss": 0.05656226724386215, |
| "teacher_loss": 0.0003561509947758168 |
| }, |
| { |
| "epoch": 0.4166666666666667, |
| "grad_norm": 5.25, |
| "kd_loss": 0.1005859375, |
| "learning_rate": 5.5441107969559315e-06, |
| "loss": 0.1578, |
| "step": 1445, |
| "student_loss": 0.2771185636520386, |
| "teacher_loss": 0.002755317836999893 |
| }, |
| { |
| "epoch": 0.41810841983852365, |
| "grad_norm": 4.1875, |
| "kd_loss": 0.138671875, |
| "learning_rate": 5.517273657689419e-06, |
| "loss": 0.1413, |
| "step": 1450, |
| "student_loss": 0.004985239822417498, |
| "teacher_loss": 0.002304993337020278 |
| }, |
| { |
| "epoch": 0.4195501730103806, |
| "grad_norm": 5.625, |
| "kd_loss": 0.10595703125, |
| "learning_rate": 5.490421446346608e-06, |
| "loss": 0.1495, |
| "step": 1455, |
| "student_loss": 0.002044258639216423, |
| "teacher_loss": 0.0006097870063968003 |
| }, |
| { |
| "epoch": 0.4209919261822376, |
| "grad_norm": 3.890625, |
| "kd_loss": 0.1123046875, |
| "learning_rate": 5.463554945334589e-06, |
| "loss": 0.1499, |
| "step": 1460, |
| "student_loss": 0.001577138900756836, |
| "teacher_loss": 0.00036811313475482166 |
| }, |
| { |
| "epoch": 0.4224336793540946, |
| "grad_norm": 4.46875, |
| "kd_loss": 0.126953125, |
| "learning_rate": 5.43667493747682e-06, |
| "loss": 0.1629, |
| "step": 1465, |
| "student_loss": 0.0005169888027012348, |
| "teacher_loss": 0.015567619353532791 |
| }, |
| { |
| "epoch": 0.42387543252595156, |
| "grad_norm": 5.75, |
| "kd_loss": 0.0859375, |
| "learning_rate": 5.409782205990317e-06, |
| "loss": 0.1757, |
| "step": 1470, |
| "student_loss": 0.0012229635613039136, |
| "teacher_loss": 0.0044844611547887325 |
| }, |
| { |
| "epoch": 0.4253171856978085, |
| "grad_norm": 6.8125, |
| "kd_loss": 0.0888671875, |
| "learning_rate": 5.3828775344628245e-06, |
| "loss": 0.1525, |
| "step": 1475, |
| "student_loss": 0.0009603102807886899, |
| "teacher_loss": 0.0009318754309788346 |
| }, |
| { |
| "epoch": 0.42675893886966554, |
| "grad_norm": 3.734375, |
| "kd_loss": 0.0908203125, |
| "learning_rate": 5.355961706829997e-06, |
| "loss": 0.1638, |
| "step": 1480, |
| "student_loss": 0.0015584760112687945, |
| "teacher_loss": 0.0012691307347267866 |
| }, |
| { |
| "epoch": 0.4282006920415225, |
| "grad_norm": 9.375, |
| "kd_loss": 0.10009765625, |
| "learning_rate": 5.329035507352548e-06, |
| "loss": 0.1616, |
| "step": 1485, |
| "student_loss": 0.0008603151072748005, |
| "teacher_loss": 0.0008554637315683067 |
| }, |
| { |
| "epoch": 0.42964244521337946, |
| "grad_norm": 8.375, |
| "kd_loss": 0.09814453125, |
| "learning_rate": 5.3020997205933985e-06, |
| "loss": 0.1486, |
| "step": 1490, |
| "student_loss": 0.10303473472595215, |
| "teacher_loss": 0.005800185259431601 |
| }, |
| { |
| "epoch": 0.43108419838523643, |
| "grad_norm": 3.4375, |
| "kd_loss": 0.09033203125, |
| "learning_rate": 5.275155131394825e-06, |
| "loss": 0.1504, |
| "step": 1495, |
| "student_loss": 0.002375382697209716, |
| "teacher_loss": 0.001016065594740212 |
| }, |
| { |
| "epoch": 0.43252595155709345, |
| "grad_norm": 4.65625, |
| "kd_loss": 0.0927734375, |
| "learning_rate": 5.248202524855578e-06, |
| "loss": 0.1497, |
| "step": 1500, |
| "student_loss": 0.046541083604097366, |
| "teacher_loss": 0.0038300170563161373 |
| }, |
| { |
| "epoch": 0.4339677047289504, |
| "grad_norm": 2.84375, |
| "kd_loss": 0.08642578125, |
| "learning_rate": 5.221242686308019e-06, |
| "loss": 0.1424, |
| "step": 1505, |
| "student_loss": 0.04092458263039589, |
| "teacher_loss": 0.0005184438778087497 |
| }, |
| { |
| "epoch": 0.4354094579008074, |
| "grad_norm": 5.9375, |
| "kd_loss": 0.14453125, |
| "learning_rate": 5.194276401295231e-06, |
| "loss": 0.1581, |
| "step": 1510, |
| "student_loss": 0.11372507363557816, |
| "teacher_loss": 0.012486970983445644 |
| }, |
| { |
| "epoch": 0.43685121107266434, |
| "grad_norm": 5.6875, |
| "kd_loss": 0.103515625, |
| "learning_rate": 5.167304455548128e-06, |
| "loss": 0.1542, |
| "step": 1515, |
| "student_loss": 0.0004513502062764019, |
| "teacher_loss": 0.0004412997222971171 |
| }, |
| { |
| "epoch": 0.43829296424452135, |
| "grad_norm": 4.46875, |
| "kd_loss": 0.09326171875, |
| "learning_rate": 5.14032763496257e-06, |
| "loss": 0.1503, |
| "step": 1520, |
| "student_loss": 0.016323139891028404, |
| "teacher_loss": 0.0005559992277994752 |
| }, |
| { |
| "epoch": 0.4397347174163783, |
| "grad_norm": 2.875, |
| "kd_loss": 0.1171875, |
| "learning_rate": 5.11334672557645e-06, |
| "loss": 0.1516, |
| "step": 1525, |
| "student_loss": 0.009535513818264008, |
| "teacher_loss": 0.001467025140300393 |
| }, |
| { |
| "epoch": 0.4411764705882353, |
| "grad_norm": 3.203125, |
| "kd_loss": 0.146484375, |
| "learning_rate": 5.086362513546807e-06, |
| "loss": 0.1389, |
| "step": 1530, |
| "student_loss": 0.1711445301771164, |
| "teacher_loss": 0.008548562414944172 |
| }, |
| { |
| "epoch": 0.4426182237600923, |
| "grad_norm": 3.96875, |
| "kd_loss": 0.09033203125, |
| "learning_rate": 5.059375785126907e-06, |
| "loss": 0.1367, |
| "step": 1535, |
| "student_loss": 0.09691781550645828, |
| "teacher_loss": 0.011890435591340065 |
| }, |
| { |
| "epoch": 0.44405997693194926, |
| "grad_norm": 3.15625, |
| "kd_loss": 0.1025390625, |
| "learning_rate": 5.032387326643331e-06, |
| "loss": 0.15, |
| "step": 1540, |
| "student_loss": 0.11253131926059723, |
| "teacher_loss": 0.00038647381006740034 |
| }, |
| { |
| "epoch": 0.4455017301038062, |
| "grad_norm": 4.28125, |
| "kd_loss": 0.0986328125, |
| "learning_rate": 5.005397924473082e-06, |
| "loss": 0.1453, |
| "step": 1545, |
| "student_loss": 0.0029098070226609707, |
| "teacher_loss": 0.007037348113954067 |
| }, |
| { |
| "epoch": 0.4469434832756632, |
| "grad_norm": 4.5625, |
| "kd_loss": 0.08935546875, |
| "learning_rate": 4.978408365020651e-06, |
| "loss": 0.1724, |
| "step": 1550, |
| "student_loss": 0.09735474735498428, |
| "teacher_loss": 0.0007218251703307033 |
| }, |
| { |
| "epoch": 0.4483852364475202, |
| "grad_norm": 5.40625, |
| "kd_loss": 0.10888671875, |
| "learning_rate": 4.951419434695115e-06, |
| "loss": 0.1426, |
| "step": 1555, |
| "student_loss": 0.024885384365916252, |
| "teacher_loss": 0.0010013995924964547 |
| }, |
| { |
| "epoch": 0.44982698961937717, |
| "grad_norm": 4.34375, |
| "kd_loss": 0.0927734375, |
| "learning_rate": 4.924431919887216e-06, |
| "loss": 0.1592, |
| "step": 1560, |
| "student_loss": 0.0011940286494791508, |
| "teacher_loss": 0.0006662014056928456 |
| }, |
| { |
| "epoch": 0.45126874279123413, |
| "grad_norm": 4.78125, |
| "kd_loss": 0.095703125, |
| "learning_rate": 4.897446606946459e-06, |
| "loss": 0.1363, |
| "step": 1565, |
| "student_loss": 0.008483109064400196, |
| "teacher_loss": 0.0011815401958301663 |
| }, |
| { |
| "epoch": 0.4527104959630911, |
| "grad_norm": 6.09375, |
| "kd_loss": 0.1337890625, |
| "learning_rate": 4.870464282158184e-06, |
| "loss": 0.1435, |
| "step": 1570, |
| "student_loss": 0.0016101751243695617, |
| "teacher_loss": 0.03063173033297062 |
| }, |
| { |
| "epoch": 0.4541522491349481, |
| "grad_norm": 4.75, |
| "kd_loss": 0.1337890625, |
| "learning_rate": 4.84348573172067e-06, |
| "loss": 0.1472, |
| "step": 1575, |
| "student_loss": 0.000616877747233957, |
| "teacher_loss": 0.02498156577348709 |
| }, |
| { |
| "epoch": 0.4555940023068051, |
| "grad_norm": 6.65625, |
| "kd_loss": 0.0791015625, |
| "learning_rate": 4.816511741722215e-06, |
| "loss": 0.1727, |
| "step": 1580, |
| "student_loss": 0.09299268573522568, |
| "teacher_loss": 0.000702059711329639 |
| }, |
| { |
| "epoch": 0.45703575547866204, |
| "grad_norm": 5.3125, |
| "kd_loss": 0.09423828125, |
| "learning_rate": 4.7895430981182415e-06, |
| "loss": 0.1725, |
| "step": 1585, |
| "student_loss": 0.001776401768438518, |
| "teacher_loss": 0.0011725560761988163 |
| }, |
| { |
| "epoch": 0.458477508650519, |
| "grad_norm": 5.375, |
| "kd_loss": 0.0966796875, |
| "learning_rate": 4.762580586708389e-06, |
| "loss": 0.1547, |
| "step": 1590, |
| "student_loss": 0.0018952718237414956, |
| "teacher_loss": 0.0010945210233330727 |
| }, |
| { |
| "epoch": 0.459919261822376, |
| "grad_norm": 4.4375, |
| "kd_loss": 0.09619140625, |
| "learning_rate": 4.73562499311362e-06, |
| "loss": 0.1534, |
| "step": 1595, |
| "student_loss": 0.0008283228962682188, |
| "teacher_loss": 0.0009313338669016957 |
| }, |
| { |
| "epoch": 0.461361014994233, |
| "grad_norm": 3.984375, |
| "kd_loss": 0.08984375, |
| "learning_rate": 4.708677102753331e-06, |
| "loss": 0.1371, |
| "step": 1600, |
| "student_loss": 0.04035179316997528, |
| "teacher_loss": 0.002971302019432187 |
| }, |
| { |
| "epoch": 0.46280276816608995, |
| "grad_norm": 5.4375, |
| "kd_loss": 0.10888671875, |
| "learning_rate": 4.681737700822464e-06, |
| "loss": 0.1731, |
| "step": 1605, |
| "student_loss": 0.0004709873755928129, |
| "teacher_loss": 0.025587571784853935 |
| }, |
| { |
| "epoch": 0.46424452133794697, |
| "grad_norm": 3.015625, |
| "kd_loss": 0.142578125, |
| "learning_rate": 4.654807572268628e-06, |
| "loss": 0.1602, |
| "step": 1610, |
| "student_loss": 0.002385765314102173, |
| "teacher_loss": 0.0010947687551379204 |
| }, |
| { |
| "epoch": 0.46568627450980393, |
| "grad_norm": 4.65625, |
| "kd_loss": 0.10009765625, |
| "learning_rate": 4.627887501769231e-06, |
| "loss": 0.1628, |
| "step": 1615, |
| "student_loss": 0.004139338154345751, |
| "teacher_loss": 0.009530629962682724 |
| }, |
| { |
| "epoch": 0.4671280276816609, |
| "grad_norm": 3.9375, |
| "kd_loss": 0.119140625, |
| "learning_rate": 4.600978273708612e-06, |
| "loss": 0.153, |
| "step": 1620, |
| "student_loss": 0.003188611473888159, |
| "teacher_loss": 0.001373080536723137 |
| }, |
| { |
| "epoch": 0.46856978085351786, |
| "grad_norm": 4.28125, |
| "kd_loss": 0.1025390625, |
| "learning_rate": 4.574080672155189e-06, |
| "loss": 0.1591, |
| "step": 1625, |
| "student_loss": 0.0014868304133415222, |
| "teacher_loss": 0.000639898469671607 |
| }, |
| { |
| "epoch": 0.4700115340253749, |
| "grad_norm": 3.21875, |
| "kd_loss": 0.0810546875, |
| "learning_rate": 4.547195480838612e-06, |
| "loss": 0.1515, |
| "step": 1630, |
| "student_loss": 0.040516145527362823, |
| "teacher_loss": 0.0004312426899559796 |
| }, |
| { |
| "epoch": 0.47145328719723184, |
| "grad_norm": 4.84375, |
| "kd_loss": 0.087890625, |
| "learning_rate": 4.520323483126928e-06, |
| "loss": 0.1862, |
| "step": 1635, |
| "student_loss": 0.12052398920059204, |
| "teacher_loss": 0.0003846465260721743 |
| }, |
| { |
| "epoch": 0.4728950403690888, |
| "grad_norm": 4.65625, |
| "kd_loss": 0.0908203125, |
| "learning_rate": 4.493465462003756e-06, |
| "loss": 0.1453, |
| "step": 1640, |
| "student_loss": 0.0010070661082863808, |
| "teacher_loss": 0.0012260322691872716 |
| }, |
| { |
| "epoch": 0.47433679354094577, |
| "grad_norm": 2.875, |
| "kd_loss": 0.09375, |
| "learning_rate": 4.4666222000454685e-06, |
| "loss": 0.1545, |
| "step": 1645, |
| "student_loss": 0.0013779783621430397, |
| "teacher_loss": 0.00042751312139444053 |
| }, |
| { |
| "epoch": 0.4757785467128028, |
| "grad_norm": 7.15625, |
| "kd_loss": 0.0869140625, |
| "learning_rate": 4.4397944793983946e-06, |
| "loss": 0.1599, |
| "step": 1650, |
| "student_loss": 0.0005161279696039855, |
| "teacher_loss": 0.0007879316690377891 |
| }, |
| { |
| "epoch": 0.47722029988465975, |
| "grad_norm": 3.421875, |
| "kd_loss": 0.08203125, |
| "learning_rate": 4.4129830817560284e-06, |
| "loss": 0.1627, |
| "step": 1655, |
| "student_loss": 0.003220248268917203, |
| "teacher_loss": 0.0008016406209208071 |
| }, |
| { |
| "epoch": 0.4786620530565167, |
| "grad_norm": 4.125, |
| "kd_loss": 0.1015625, |
| "learning_rate": 4.386188788336251e-06, |
| "loss": 0.1404, |
| "step": 1660, |
| "student_loss": 0.08471440523862839, |
| "teacher_loss": 0.0003611688152886927 |
| }, |
| { |
| "epoch": 0.4801038062283737, |
| "grad_norm": 4.0, |
| "kd_loss": 0.09716796875, |
| "learning_rate": 4.359412379858569e-06, |
| "loss": 0.1428, |
| "step": 1665, |
| "student_loss": 0.0006392439245246351, |
| "teacher_loss": 0.00034953776048496366 |
| }, |
| { |
| "epoch": 0.4815455594002307, |
| "grad_norm": 6.0, |
| "kd_loss": 0.1484375, |
| "learning_rate": 4.332654636521365e-06, |
| "loss": 0.1493, |
| "step": 1670, |
| "student_loss": 0.13810043036937714, |
| "teacher_loss": 0.005655170418322086 |
| }, |
| { |
| "epoch": 0.48298731257208766, |
| "grad_norm": 2.9375, |
| "kd_loss": 0.1015625, |
| "learning_rate": 4.3059163379791676e-06, |
| "loss": 0.1588, |
| "step": 1675, |
| "student_loss": 0.0011410564184188843, |
| "teacher_loss": 0.0009165616356767714 |
| }, |
| { |
| "epoch": 0.4844290657439446, |
| "grad_norm": 3.96875, |
| "kd_loss": 0.103515625, |
| "learning_rate": 4.279198263319932e-06, |
| "loss": 0.1983, |
| "step": 1680, |
| "student_loss": 0.042820997536182404, |
| "teacher_loss": 0.0006943390471860766 |
| }, |
| { |
| "epoch": 0.48587081891580164, |
| "grad_norm": 4.65625, |
| "kd_loss": 0.0927734375, |
| "learning_rate": 4.252501191042334e-06, |
| "loss": 0.1458, |
| "step": 1685, |
| "student_loss": 0.001107779797166586, |
| "teacher_loss": 0.000587086018640548 |
| }, |
| { |
| "epoch": 0.4873125720876586, |
| "grad_norm": 3.84375, |
| "kd_loss": 0.08154296875, |
| "learning_rate": 4.2258258990331015e-06, |
| "loss": 0.1505, |
| "step": 1690, |
| "student_loss": 0.0010096587939187884, |
| "teacher_loss": 0.0006587179377675056 |
| }, |
| { |
| "epoch": 0.48875432525951557, |
| "grad_norm": 3.296875, |
| "kd_loss": 0.09423828125, |
| "learning_rate": 4.199173164544331e-06, |
| "loss": 0.1301, |
| "step": 1695, |
| "student_loss": 0.0007151216268539429, |
| "teacher_loss": 0.0005271242698654532 |
| }, |
| { |
| "epoch": 0.49019607843137253, |
| "grad_norm": 2.75, |
| "kd_loss": 0.111328125, |
| "learning_rate": 4.1725437641708535e-06, |
| "loss": 0.1292, |
| "step": 1700, |
| "student_loss": 0.0004782706964761019, |
| "teacher_loss": 0.00025354631361551583 |
| }, |
| { |
| "epoch": 0.49163783160322955, |
| "grad_norm": 4.84375, |
| "kd_loss": 0.0908203125, |
| "learning_rate": 4.145938473827598e-06, |
| "loss": 0.1694, |
| "step": 1705, |
| "student_loss": 0.03137379139661789, |
| "teacher_loss": 0.0014872003812342882 |
| }, |
| { |
| "epoch": 0.4930795847750865, |
| "grad_norm": 3.890625, |
| "kd_loss": 0.0859375, |
| "learning_rate": 4.1193580687269896e-06, |
| "loss": 0.1799, |
| "step": 1710, |
| "student_loss": 0.13360068202018738, |
| "teacher_loss": 0.0003576852031983435 |
| }, |
| { |
| "epoch": 0.4945213379469435, |
| "grad_norm": 3.6875, |
| "kd_loss": 0.0849609375, |
| "learning_rate": 4.092803323356357e-06, |
| "loss": 0.1568, |
| "step": 1715, |
| "student_loss": 0.0007690335623919964, |
| "teacher_loss": 0.0004268670454621315 |
| }, |
| { |
| "epoch": 0.49596309111880044, |
| "grad_norm": 4.53125, |
| "kd_loss": 0.09423828125, |
| "learning_rate": 4.066275011455369e-06, |
| "loss": 0.1345, |
| "step": 1720, |
| "student_loss": 0.0010869682300835848, |
| "teacher_loss": 0.0008715330623090267 |
| }, |
| { |
| "epoch": 0.49740484429065746, |
| "grad_norm": 3.78125, |
| "kd_loss": 0.10302734375, |
| "learning_rate": 4.039773905993486e-06, |
| "loss": 0.1661, |
| "step": 1725, |
| "student_loss": 0.001568131148815155, |
| "teacher_loss": 0.0013625255087390542 |
| }, |
| { |
| "epoch": 0.4988465974625144, |
| "grad_norm": 3.234375, |
| "kd_loss": 0.103515625, |
| "learning_rate": 4.013300779147445e-06, |
| "loss": 0.1311, |
| "step": 1730, |
| "student_loss": 0.003118544816970825, |
| "teacher_loss": 0.027190769091248512 |
| }, |
| { |
| "epoch": 0.5002883506343714, |
| "grad_norm": 3.984375, |
| "kd_loss": 0.1337890625, |
| "learning_rate": 3.98685640227875e-06, |
| "loss": 0.1465, |
| "step": 1735, |
| "student_loss": 0.004482457414269447, |
| "teacher_loss": 0.06759393215179443 |
| }, |
| { |
| "epoch": 0.5017301038062284, |
| "grad_norm": 4.84375, |
| "kd_loss": 0.1123046875, |
| "learning_rate": 3.960441545911205e-06, |
| "loss": 0.1692, |
| "step": 1740, |
| "student_loss": 0.0023235215339809656, |
| "teacher_loss": 0.027365142479538918 |
| }, |
| { |
| "epoch": 0.5031718569780853, |
| "grad_norm": 2.78125, |
| "kd_loss": 0.11474609375, |
| "learning_rate": 3.934056979708456e-06, |
| "loss": 0.1393, |
| "step": 1745, |
| "student_loss": 0.0015286001143977046, |
| "teacher_loss": 0.02260914258658886 |
| }, |
| { |
| "epoch": 0.5046136101499423, |
| "grad_norm": 6.3125, |
| "kd_loss": 0.10009765625, |
| "learning_rate": 3.907703472451574e-06, |
| "loss": 0.1627, |
| "step": 1750, |
| "student_loss": 0.009829165413975716, |
| "teacher_loss": 0.0006399175035767257 |
| }, |
| { |
| "epoch": 0.5060553633217993, |
| "grad_norm": 7.96875, |
| "kd_loss": 0.099609375, |
| "learning_rate": 3.881381792016645e-06, |
| "loss": 0.1749, |
| "step": 1755, |
| "student_loss": 0.0006391415954567492, |
| "teacher_loss": 0.0003294479101896286 |
| }, |
| { |
| "epoch": 0.5074971164936563, |
| "grad_norm": 3.53125, |
| "kd_loss": 0.1220703125, |
| "learning_rate": 3.8550927053523994e-06, |
| "loss": 0.1389, |
| "step": 1760, |
| "student_loss": 0.0007123491377569735, |
| "teacher_loss": 0.046919528394937515 |
| }, |
| { |
| "epoch": 0.5089388696655133, |
| "grad_norm": 5.09375, |
| "kd_loss": 0.083984375, |
| "learning_rate": 3.828836978457868e-06, |
| "loss": 0.1522, |
| "step": 1765, |
| "student_loss": 0.0016679943073540926, |
| "teacher_loss": 0.000926964043173939 |
| }, |
| { |
| "epoch": 0.5103806228373703, |
| "grad_norm": 3.671875, |
| "kd_loss": 0.1357421875, |
| "learning_rate": 3.8026153763600603e-06, |
| "loss": 0.1477, |
| "step": 1770, |
| "student_loss": 0.022712958976626396, |
| "teacher_loss": 0.029349761083722115 |
| }, |
| { |
| "epoch": 0.5118223760092272, |
| "grad_norm": 3.625, |
| "kd_loss": 0.09228515625, |
| "learning_rate": 3.7764286630916704e-06, |
| "loss": 0.1425, |
| "step": 1775, |
| "student_loss": 0.0012370613403618336, |
| "teacher_loss": 0.0007929064449854195 |
| }, |
| { |
| "epoch": 0.5132641291810842, |
| "grad_norm": 3.234375, |
| "kd_loss": 0.0947265625, |
| "learning_rate": 3.7502776016688234e-06, |
| "loss": 0.1589, |
| "step": 1780, |
| "student_loss": 0.008692040108144283, |
| "teacher_loss": 0.0190599225461483 |
| }, |
| { |
| "epoch": 0.5147058823529411, |
| "grad_norm": 4.46875, |
| "kd_loss": 0.1181640625, |
| "learning_rate": 3.724162954068835e-06, |
| "loss": 0.1568, |
| "step": 1785, |
| "student_loss": 0.1760350614786148, |
| "teacher_loss": 0.028955036774277687 |
| }, |
| { |
| "epoch": 0.5161476355247981, |
| "grad_norm": 3.28125, |
| "kd_loss": 0.083984375, |
| "learning_rate": 3.6980854812080097e-06, |
| "loss": 0.1497, |
| "step": 1790, |
| "student_loss": 0.0018669115379452705, |
| "teacher_loss": 0.0008881228277459741 |
| }, |
| { |
| "epoch": 0.5175893886966552, |
| "grad_norm": 4.6875, |
| "kd_loss": 0.08984375, |
| "learning_rate": 3.6720459429194743e-06, |
| "loss": 0.1635, |
| "step": 1795, |
| "student_loss": 0.1518515795469284, |
| "teacher_loss": 0.00034799822606146336 |
| }, |
| { |
| "epoch": 0.5190311418685121, |
| "grad_norm": 2.828125, |
| "kd_loss": 0.0869140625, |
| "learning_rate": 3.646045097931037e-06, |
| "loss": 0.1584, |
| "step": 1800, |
| "student_loss": 0.0007797479629516602, |
| "teacher_loss": 0.0005545561434701085 |
| }, |
| { |
| "epoch": 0.5204728950403691, |
| "grad_norm": 5.5, |
| "kd_loss": 0.08056640625, |
| "learning_rate": 3.620083703843077e-06, |
| "loss": 0.1433, |
| "step": 1805, |
| "student_loss": 0.03987161070108414, |
| "teacher_loss": 0.00046788767213001847 |
| }, |
| { |
| "epoch": 0.5219146482122261, |
| "grad_norm": 4.0, |
| "kd_loss": 0.095703125, |
| "learning_rate": 3.594162517106472e-06, |
| "loss": 0.1646, |
| "step": 1810, |
| "student_loss": 0.0409666933119297, |
| "teacher_loss": 0.0038134430069476366 |
| }, |
| { |
| "epoch": 0.523356401384083, |
| "grad_norm": 3.71875, |
| "kd_loss": 0.10693359375, |
| "learning_rate": 3.5682822930005567e-06, |
| "loss": 0.1424, |
| "step": 1815, |
| "student_loss": 0.30990689992904663, |
| "teacher_loss": 0.019313883036375046 |
| }, |
| { |
| "epoch": 0.52479815455594, |
| "grad_norm": 6.0625, |
| "kd_loss": 0.1015625, |
| "learning_rate": 3.542443785611117e-06, |
| "loss": 0.1455, |
| "step": 1820, |
| "student_loss": 0.05496774613857269, |
| "teacher_loss": 0.003318265313282609 |
| }, |
| { |
| "epoch": 0.526239907727797, |
| "grad_norm": 7.9375, |
| "kd_loss": 0.103515625, |
| "learning_rate": 3.516647747808417e-06, |
| "loss": 0.1445, |
| "step": 1825, |
| "student_loss": 0.0014039704110473394, |
| "teacher_loss": 0.004902483429759741 |
| }, |
| { |
| "epoch": 0.527681660899654, |
| "grad_norm": 3.84375, |
| "kd_loss": 0.08642578125, |
| "learning_rate": 3.4908949312252593e-06, |
| "loss": 0.1453, |
| "step": 1830, |
| "student_loss": 0.001090447069145739, |
| "teacher_loss": 0.0005476956139318645 |
| }, |
| { |
| "epoch": 0.529123414071511, |
| "grad_norm": 3.828125, |
| "kd_loss": 0.10595703125, |
| "learning_rate": 3.4651860862350893e-06, |
| "loss": 0.1355, |
| "step": 1835, |
| "student_loss": 0.00827399455010891, |
| "teacher_loss": 0.0005034086061641574 |
| }, |
| { |
| "epoch": 0.5305651672433679, |
| "grad_norm": 3.375, |
| "kd_loss": 0.083984375, |
| "learning_rate": 3.4395219619301288e-06, |
| "loss": 0.1429, |
| "step": 1840, |
| "student_loss": 0.02982058748602867, |
| "teacher_loss": 0.003948381636291742 |
| }, |
| { |
| "epoch": 0.5320069204152249, |
| "grad_norm": 4.0625, |
| "kd_loss": 0.10791015625, |
| "learning_rate": 3.4139033060995484e-06, |
| "loss": 0.1606, |
| "step": 1845, |
| "student_loss": 0.0009567984379827976, |
| "teacher_loss": 0.0006957401055842638 |
| }, |
| { |
| "epoch": 0.5334486735870819, |
| "grad_norm": 3.484375, |
| "kd_loss": 0.0859375, |
| "learning_rate": 3.388330865207681e-06, |
| "loss": 0.1516, |
| "step": 1850, |
| "student_loss": 0.09060114622116089, |
| "teacher_loss": 0.011022915132343769 |
| }, |
| { |
| "epoch": 0.5348904267589388, |
| "grad_norm": 3.8125, |
| "kd_loss": 0.09326171875, |
| "learning_rate": 3.3628053843722674e-06, |
| "loss": 0.1586, |
| "step": 1855, |
| "student_loss": 0.0023815552704036236, |
| "teacher_loss": 0.0009834859520196915 |
| }, |
| { |
| "epoch": 0.5363321799307958, |
| "grad_norm": 2.890625, |
| "kd_loss": 0.08642578125, |
| "learning_rate": 3.337327607342753e-06, |
| "loss": 0.1443, |
| "step": 1860, |
| "student_loss": 0.0012727677822113037, |
| "teacher_loss": 0.0003461229207459837 |
| }, |
| { |
| "epoch": 0.5377739331026529, |
| "grad_norm": 3.796875, |
| "kd_loss": 0.09765625, |
| "learning_rate": 3.3118982764786055e-06, |
| "loss": 0.1753, |
| "step": 1865, |
| "student_loss": 0.12450817972421646, |
| "teacher_loss": 0.00035991144250147045 |
| }, |
| { |
| "epoch": 0.5392156862745098, |
| "grad_norm": 6.3125, |
| "kd_loss": 0.099609375, |
| "learning_rate": 3.2865181327277007e-06, |
| "loss": 0.1487, |
| "step": 1870, |
| "student_loss": 0.14028604328632355, |
| "teacher_loss": 0.0013080085627734661 |
| }, |
| { |
| "epoch": 0.5406574394463668, |
| "grad_norm": 3.25, |
| "kd_loss": 0.08984375, |
| "learning_rate": 3.2611879156047147e-06, |
| "loss": 0.1471, |
| "step": 1875, |
| "student_loss": 0.0018398945685476065, |
| "teacher_loss": 0.0011750732082873583 |
| }, |
| { |
| "epoch": 0.5420991926182238, |
| "grad_norm": 4.53125, |
| "kd_loss": 0.11376953125, |
| "learning_rate": 3.2359083631695897e-06, |
| "loss": 0.1327, |
| "step": 1880, |
| "student_loss": 0.03754269704222679, |
| "teacher_loss": 0.0011901544639840722 |
| }, |
| { |
| "epoch": 0.5435409457900807, |
| "grad_norm": 4.625, |
| "kd_loss": 0.10400390625, |
| "learning_rate": 3.2106802120060197e-06, |
| "loss": 0.1568, |
| "step": 1885, |
| "student_loss": 0.003520218888297677, |
| "teacher_loss": 0.0013501073699444532 |
| }, |
| { |
| "epoch": 0.5449826989619377, |
| "grad_norm": 4.1875, |
| "kd_loss": 0.08203125, |
| "learning_rate": 3.185504197199999e-06, |
| "loss": 0.1376, |
| "step": 1890, |
| "student_loss": 0.008974825032055378, |
| "teacher_loss": 0.00042682504863478243 |
| }, |
| { |
| "epoch": 0.5464244521337946, |
| "grad_norm": 5.1875, |
| "kd_loss": 0.0966796875, |
| "learning_rate": 3.160381052318393e-06, |
| "loss": 0.1649, |
| "step": 1895, |
| "student_loss": 0.0789928063750267, |
| "teacher_loss": 0.0007635668735019863 |
| }, |
| { |
| "epoch": 0.5478662053056517, |
| "grad_norm": 2.859375, |
| "kd_loss": 0.0791015625, |
| "learning_rate": 3.1353115093875676e-06, |
| "loss": 0.1554, |
| "step": 1900, |
| "student_loss": 0.002470338949933648, |
| "teacher_loss": 0.0004501968214754015 |
| }, |
| { |
| "epoch": 0.5493079584775087, |
| "grad_norm": 4.46875, |
| "kd_loss": 0.09375, |
| "learning_rate": 3.1102962988720615e-06, |
| "loss": 0.1432, |
| "step": 1905, |
| "student_loss": 0.04193798825144768, |
| "teacher_loss": 0.0009529749513603747 |
| }, |
| { |
| "epoch": 0.5507497116493656, |
| "grad_norm": 5.25, |
| "kd_loss": 0.09716796875, |
| "learning_rate": 3.085336149653303e-06, |
| "loss": 0.1487, |
| "step": 1910, |
| "student_loss": 0.000636325916275382, |
| "teacher_loss": 0.0005562056903727353 |
| }, |
| { |
| "epoch": 0.5521914648212226, |
| "grad_norm": 3.359375, |
| "kd_loss": 0.10107421875, |
| "learning_rate": 3.060431789008368e-06, |
| "loss": 0.1681, |
| "step": 1915, |
| "student_loss": 0.0016948822885751724, |
| "teacher_loss": 0.0023001739755272865 |
| }, |
| { |
| "epoch": 0.5536332179930796, |
| "grad_norm": 3.578125, |
| "kd_loss": 0.08447265625, |
| "learning_rate": 3.035583942588791e-06, |
| "loss": 0.1655, |
| "step": 1920, |
| "student_loss": 0.055358272045850754, |
| "teacher_loss": 0.00030110430088825524 |
| }, |
| { |
| "epoch": 0.5550749711649365, |
| "grad_norm": 4.125, |
| "kd_loss": 0.09814453125, |
| "learning_rate": 3.0107933343994233e-06, |
| "loss": 0.1582, |
| "step": 1925, |
| "student_loss": 0.008619318716228008, |
| "teacher_loss": 0.0032901125960052013 |
| }, |
| { |
| "epoch": 0.5565167243367936, |
| "grad_norm": 6.75, |
| "kd_loss": 0.08349609375, |
| "learning_rate": 2.9860606867773323e-06, |
| "loss": 0.1394, |
| "step": 1930, |
| "student_loss": 0.03425801545381546, |
| "teacher_loss": 0.00030506699113175273 |
| }, |
| { |
| "epoch": 0.5579584775086506, |
| "grad_norm": 4.3125, |
| "kd_loss": 0.0908203125, |
| "learning_rate": 2.9613867203707627e-06, |
| "loss": 0.1535, |
| "step": 1935, |
| "student_loss": 0.14860902726650238, |
| "teacher_loss": 0.021592382341623306 |
| }, |
| { |
| "epoch": 0.5594002306805075, |
| "grad_norm": 4.96875, |
| "kd_loss": 0.10546875, |
| "learning_rate": 2.936772154118129e-06, |
| "loss": 0.1545, |
| "step": 1940, |
| "student_loss": 0.007172099314630032, |
| "teacher_loss": 0.000852234719786793 |
| }, |
| { |
| "epoch": 0.5608419838523645, |
| "grad_norm": 3.828125, |
| "kd_loss": 0.09619140625, |
| "learning_rate": 2.912217705227075e-06, |
| "loss": 0.1493, |
| "step": 1945, |
| "student_loss": 0.04466139152646065, |
| "teacher_loss": 0.028232689946889877 |
| }, |
| { |
| "epoch": 0.5622837370242214, |
| "grad_norm": 8.1875, |
| "kd_loss": 0.08740234375, |
| "learning_rate": 2.88772408915357e-06, |
| "loss": 0.1749, |
| "step": 1950, |
| "student_loss": 0.0008998726261779666, |
| "teacher_loss": 0.0005217011785134673 |
| }, |
| { |
| "epoch": 0.5637254901960784, |
| "grad_norm": 4.0625, |
| "kd_loss": 0.10498046875, |
| "learning_rate": 2.863292019581071e-06, |
| "loss": 0.1535, |
| "step": 1955, |
| "student_loss": 0.23264119029045105, |
| "teacher_loss": 0.0003505937347654253 |
| }, |
| { |
| "epoch": 0.5651672433679354, |
| "grad_norm": 4.5, |
| "kd_loss": 0.11572265625, |
| "learning_rate": 2.838922208399712e-06, |
| "loss": 0.1646, |
| "step": 1960, |
| "student_loss": 0.005253693088889122, |
| "teacher_loss": 0.0008160973084159195 |
| }, |
| { |
| "epoch": 0.5666089965397924, |
| "grad_norm": 3.359375, |
| "kd_loss": 0.08935546875, |
| "learning_rate": 2.8146153656855858e-06, |
| "loss": 0.1571, |
| "step": 1965, |
| "student_loss": 0.0008905039285309613, |
| "teacher_loss": 0.00038642369327135384 |
| }, |
| { |
| "epoch": 0.5680507497116494, |
| "grad_norm": 6.96875, |
| "kd_loss": 0.10107421875, |
| "learning_rate": 2.7903721996800248e-06, |
| "loss": 0.1488, |
| "step": 1970, |
| "student_loss": 0.001944546471349895, |
| "teacher_loss": 0.0004150049644522369 |
| }, |
| { |
| "epoch": 0.5694925028835064, |
| "grad_norm": 2.453125, |
| "kd_loss": 0.09716796875, |
| "learning_rate": 2.7661934167689887e-06, |
| "loss": 0.1556, |
| "step": 1975, |
| "student_loss": 0.0032470019068568945, |
| "teacher_loss": 0.0009415296372026205 |
| }, |
| { |
| "epoch": 0.5709342560553633, |
| "grad_norm": 2.25, |
| "kd_loss": 0.08642578125, |
| "learning_rate": 2.742079721462471e-06, |
| "loss": 0.1674, |
| "step": 1980, |
| "student_loss": 0.05945152789354324, |
| "teacher_loss": 0.0008091035415418446 |
| }, |
| { |
| "epoch": 0.5723760092272203, |
| "grad_norm": 3.65625, |
| "kd_loss": 0.0771484375, |
| "learning_rate": 2.7180318163739704e-06, |
| "loss": 0.1519, |
| "step": 1985, |
| "student_loss": 0.0015980260213837028, |
| "teacher_loss": 0.0005768106202594936 |
| }, |
| { |
| "epoch": 0.5738177623990772, |
| "grad_norm": 2.65625, |
| "kd_loss": 0.115234375, |
| "learning_rate": 2.6940504022000248e-06, |
| "loss": 0.1546, |
| "step": 1990, |
| "student_loss": 0.0632084533572197, |
| "teacher_loss": 0.015749456360936165 |
| }, |
| { |
| "epoch": 0.5752595155709342, |
| "grad_norm": 5.4375, |
| "kd_loss": 0.0869140625, |
| "learning_rate": 2.67013617769979e-06, |
| "loss": 0.153, |
| "step": 1995, |
| "student_loss": 0.0011626326013356447, |
| "teacher_loss": 0.000745030993130058 |
| }, |
| { |
| "epoch": 0.5767012687427913, |
| "grad_norm": 4.0, |
| "kd_loss": 0.09912109375, |
| "learning_rate": 2.6462898396746783e-06, |
| "loss": 0.1493, |
| "step": 2000, |
| "student_loss": 0.002248254604637623, |
| "teacher_loss": 0.0006202560034580529 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 3000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|