{ "best_global_step": 2000, "best_metric": 0.6022624256040361, "best_model_checkpoint": "./SALAMA_NEWMEDT/checkpoint-2000", "epoch": 2.0, "eval_steps": 2000, "global_step": 3550, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005633802816901409, "grad_norm": 1.2029443979263306, "learning_rate": 1.8e-07, "loss": 0.0085, "step": 10 }, { "epoch": 0.011267605633802818, "grad_norm": 1.1895703077316284, "learning_rate": 3.8e-07, "loss": 0.0089, "step": 20 }, { "epoch": 0.016901408450704224, "grad_norm": 0.9030239582061768, "learning_rate": 5.800000000000001e-07, "loss": 0.0078, "step": 30 }, { "epoch": 0.022535211267605635, "grad_norm": 1.161505103111267, "learning_rate": 7.8e-07, "loss": 0.0101, "step": 40 }, { "epoch": 0.028169014084507043, "grad_norm": 0.7956001162528992, "learning_rate": 9.800000000000001e-07, "loss": 0.0087, "step": 50 }, { "epoch": 0.03380281690140845, "grad_norm": 0.9211345911026001, "learning_rate": 1.1800000000000001e-06, "loss": 0.0083, "step": 60 }, { "epoch": 0.03943661971830986, "grad_norm": 1.4743120670318604, "learning_rate": 1.3800000000000001e-06, "loss": 0.0125, "step": 70 }, { "epoch": 0.04507042253521127, "grad_norm": 2.1086606979370117, "learning_rate": 1.5800000000000001e-06, "loss": 0.0072, "step": 80 }, { "epoch": 0.05070422535211268, "grad_norm": 0.6348907351493835, "learning_rate": 1.7800000000000001e-06, "loss": 0.0054, "step": 90 }, { "epoch": 0.056338028169014086, "grad_norm": 0.7495781779289246, "learning_rate": 1.98e-06, "loss": 0.0089, "step": 100 }, { "epoch": 0.061971830985915494, "grad_norm": 0.7683187127113342, "learning_rate": 2.1800000000000003e-06, "loss": 0.0112, "step": 110 }, { "epoch": 0.0676056338028169, "grad_norm": 0.9579158425331116, "learning_rate": 2.38e-06, "loss": 0.0117, "step": 120 }, { "epoch": 0.07323943661971831, "grad_norm": 0.5810931324958801, "learning_rate": 2.5800000000000003e-06, "loss": 0.0101, "step": 130 }, { "epoch": 0.07887323943661972, "grad_norm": 0.24839498102664948, "learning_rate": 2.7800000000000005e-06, "loss": 0.0064, "step": 140 }, { "epoch": 0.08450704225352113, "grad_norm": 1.1974351406097412, "learning_rate": 2.9800000000000003e-06, "loss": 0.0054, "step": 150 }, { "epoch": 0.09014084507042254, "grad_norm": 2.2109923362731934, "learning_rate": 3.1800000000000005e-06, "loss": 0.0075, "step": 160 }, { "epoch": 0.09577464788732394, "grad_norm": 0.7371222972869873, "learning_rate": 3.3800000000000007e-06, "loss": 0.0107, "step": 170 }, { "epoch": 0.10140845070422536, "grad_norm": 1.1431416273117065, "learning_rate": 3.58e-06, "loss": 0.0072, "step": 180 }, { "epoch": 0.10704225352112676, "grad_norm": 1.5261584520339966, "learning_rate": 3.7800000000000002e-06, "loss": 0.0109, "step": 190 }, { "epoch": 0.11267605633802817, "grad_norm": 0.6876276731491089, "learning_rate": 3.980000000000001e-06, "loss": 0.0083, "step": 200 }, { "epoch": 0.11830985915492957, "grad_norm": 0.5882245898246765, "learning_rate": 4.18e-06, "loss": 0.0109, "step": 210 }, { "epoch": 0.12394366197183099, "grad_norm": 1.0145407915115356, "learning_rate": 4.38e-06, "loss": 0.0105, "step": 220 }, { "epoch": 0.1295774647887324, "grad_norm": 1.5309728384017944, "learning_rate": 4.58e-06, "loss": 0.0074, "step": 230 }, { "epoch": 0.1352112676056338, "grad_norm": 0.7701055407524109, "learning_rate": 4.78e-06, "loss": 0.009, "step": 240 }, { "epoch": 0.14084507042253522, "grad_norm": 1.1244176626205444, "learning_rate": 4.980000000000001e-06, "loss": 0.0106, "step": 250 }, { "epoch": 0.14647887323943662, "grad_norm": 1.3951512575149536, "learning_rate": 5.18e-06, "loss": 0.0083, "step": 260 }, { "epoch": 0.15211267605633802, "grad_norm": 0.7924229502677917, "learning_rate": 5.380000000000001e-06, "loss": 0.0063, "step": 270 }, { "epoch": 0.15774647887323945, "grad_norm": 0.5757330656051636, "learning_rate": 5.580000000000001e-06, "loss": 0.0093, "step": 280 }, { "epoch": 0.16338028169014085, "grad_norm": 1.189134120941162, "learning_rate": 5.78e-06, "loss": 0.0097, "step": 290 }, { "epoch": 0.16901408450704225, "grad_norm": 1.2784746885299683, "learning_rate": 5.98e-06, "loss": 0.0099, "step": 300 }, { "epoch": 0.17464788732394365, "grad_norm": 1.6380332708358765, "learning_rate": 6.18e-06, "loss": 0.0075, "step": 310 }, { "epoch": 0.18028169014084508, "grad_norm": 0.9639184474945068, "learning_rate": 6.380000000000001e-06, "loss": 0.0101, "step": 320 }, { "epoch": 0.18591549295774648, "grad_norm": 1.5722377300262451, "learning_rate": 6.5800000000000005e-06, "loss": 0.0105, "step": 330 }, { "epoch": 0.19154929577464788, "grad_norm": 3.752877712249756, "learning_rate": 6.780000000000001e-06, "loss": 0.0153, "step": 340 }, { "epoch": 0.19718309859154928, "grad_norm": 0.851811945438385, "learning_rate": 6.98e-06, "loss": 0.0175, "step": 350 }, { "epoch": 0.2028169014084507, "grad_norm": 1.006993055343628, "learning_rate": 7.180000000000001e-06, "loss": 0.009, "step": 360 }, { "epoch": 0.2084507042253521, "grad_norm": 1.064292550086975, "learning_rate": 7.3800000000000005e-06, "loss": 0.0098, "step": 370 }, { "epoch": 0.2140845070422535, "grad_norm": 1.17357337474823, "learning_rate": 7.58e-06, "loss": 0.0121, "step": 380 }, { "epoch": 0.21971830985915494, "grad_norm": 1.702314853668213, "learning_rate": 7.78e-06, "loss": 0.015, "step": 390 }, { "epoch": 0.22535211267605634, "grad_norm": 1.3108038902282715, "learning_rate": 7.980000000000002e-06, "loss": 0.0104, "step": 400 }, { "epoch": 0.23098591549295774, "grad_norm": 0.9514098167419434, "learning_rate": 8.18e-06, "loss": 0.009, "step": 410 }, { "epoch": 0.23661971830985915, "grad_norm": 1.800310492515564, "learning_rate": 8.380000000000001e-06, "loss": 0.0123, "step": 420 }, { "epoch": 0.24225352112676057, "grad_norm": 2.535449743270874, "learning_rate": 8.580000000000001e-06, "loss": 0.0137, "step": 430 }, { "epoch": 0.24788732394366197, "grad_norm": 1.0771689414978027, "learning_rate": 8.78e-06, "loss": 0.0161, "step": 440 }, { "epoch": 0.2535211267605634, "grad_norm": 0.4419674873352051, "learning_rate": 8.98e-06, "loss": 0.0125, "step": 450 }, { "epoch": 0.2591549295774648, "grad_norm": 1.9389402866363525, "learning_rate": 9.180000000000002e-06, "loss": 0.0115, "step": 460 }, { "epoch": 0.2647887323943662, "grad_norm": 5.996421813964844, "learning_rate": 9.38e-06, "loss": 0.0174, "step": 470 }, { "epoch": 0.2704225352112676, "grad_norm": 1.4965449571609497, "learning_rate": 9.58e-06, "loss": 0.0197, "step": 480 }, { "epoch": 0.27605633802816903, "grad_norm": 1.4441680908203125, "learning_rate": 9.780000000000001e-06, "loss": 0.0162, "step": 490 }, { "epoch": 0.28169014084507044, "grad_norm": 1.0549083948135376, "learning_rate": 9.980000000000001e-06, "loss": 0.0152, "step": 500 }, { "epoch": 0.28732394366197184, "grad_norm": 1.933992624282837, "learning_rate": 9.970491803278689e-06, "loss": 0.0146, "step": 510 }, { "epoch": 0.29295774647887324, "grad_norm": 1.2404003143310547, "learning_rate": 9.937704918032788e-06, "loss": 0.0159, "step": 520 }, { "epoch": 0.29859154929577464, "grad_norm": 1.868192434310913, "learning_rate": 9.904918032786887e-06, "loss": 0.0187, "step": 530 }, { "epoch": 0.30422535211267604, "grad_norm": 1.5579264163970947, "learning_rate": 9.872131147540984e-06, "loss": 0.0125, "step": 540 }, { "epoch": 0.30985915492957744, "grad_norm": 1.4725924730300903, "learning_rate": 9.839344262295083e-06, "loss": 0.0173, "step": 550 }, { "epoch": 0.3154929577464789, "grad_norm": 2.605309247970581, "learning_rate": 9.80655737704918e-06, "loss": 0.013, "step": 560 }, { "epoch": 0.3211267605633803, "grad_norm": 1.3504048585891724, "learning_rate": 9.77377049180328e-06, "loss": 0.0161, "step": 570 }, { "epoch": 0.3267605633802817, "grad_norm": 1.5028926134109497, "learning_rate": 9.740983606557379e-06, "loss": 0.0199, "step": 580 }, { "epoch": 0.3323943661971831, "grad_norm": 0.5192097425460815, "learning_rate": 9.708196721311476e-06, "loss": 0.0148, "step": 590 }, { "epoch": 0.3380281690140845, "grad_norm": 1.3670573234558105, "learning_rate": 9.675409836065575e-06, "loss": 0.0257, "step": 600 }, { "epoch": 0.3436619718309859, "grad_norm": 1.779189109802246, "learning_rate": 9.642622950819674e-06, "loss": 0.0163, "step": 610 }, { "epoch": 0.3492957746478873, "grad_norm": 1.8127739429473877, "learning_rate": 9.609836065573771e-06, "loss": 0.013, "step": 620 }, { "epoch": 0.35492957746478876, "grad_norm": 1.9041593074798584, "learning_rate": 9.57704918032787e-06, "loss": 0.0187, "step": 630 }, { "epoch": 0.36056338028169016, "grad_norm": 1.1961556673049927, "learning_rate": 9.544262295081968e-06, "loss": 0.0193, "step": 640 }, { "epoch": 0.36619718309859156, "grad_norm": 2.3215367794036865, "learning_rate": 9.511475409836067e-06, "loss": 0.0179, "step": 650 }, { "epoch": 0.37183098591549296, "grad_norm": 1.2701491117477417, "learning_rate": 9.478688524590164e-06, "loss": 0.0139, "step": 660 }, { "epoch": 0.37746478873239436, "grad_norm": 2.1906158924102783, "learning_rate": 9.445901639344263e-06, "loss": 0.0174, "step": 670 }, { "epoch": 0.38309859154929576, "grad_norm": 1.7713816165924072, "learning_rate": 9.413114754098362e-06, "loss": 0.0231, "step": 680 }, { "epoch": 0.38873239436619716, "grad_norm": 4.938525199890137, "learning_rate": 9.38032786885246e-06, "loss": 0.0165, "step": 690 }, { "epoch": 0.39436619718309857, "grad_norm": 1.2974543571472168, "learning_rate": 9.347540983606559e-06, "loss": 0.0173, "step": 700 }, { "epoch": 0.4, "grad_norm": 2.3422200679779053, "learning_rate": 9.314754098360656e-06, "loss": 0.0177, "step": 710 }, { "epoch": 0.4056338028169014, "grad_norm": 1.7533538341522217, "learning_rate": 9.281967213114755e-06, "loss": 0.0133, "step": 720 }, { "epoch": 0.4112676056338028, "grad_norm": 1.83546781539917, "learning_rate": 9.249180327868852e-06, "loss": 0.0198, "step": 730 }, { "epoch": 0.4169014084507042, "grad_norm": 3.8797242641448975, "learning_rate": 9.216393442622951e-06, "loss": 0.0137, "step": 740 }, { "epoch": 0.4225352112676056, "grad_norm": 0.6942452788352966, "learning_rate": 9.183606557377049e-06, "loss": 0.0131, "step": 750 }, { "epoch": 0.428169014084507, "grad_norm": 2.0836191177368164, "learning_rate": 9.150819672131148e-06, "loss": 0.022, "step": 760 }, { "epoch": 0.43380281690140843, "grad_norm": 1.7993505001068115, "learning_rate": 9.118032786885247e-06, "loss": 0.0177, "step": 770 }, { "epoch": 0.4394366197183099, "grad_norm": 1.7622896432876587, "learning_rate": 9.085245901639344e-06, "loss": 0.0191, "step": 780 }, { "epoch": 0.4450704225352113, "grad_norm": 1.080950140953064, "learning_rate": 9.052459016393443e-06, "loss": 0.0136, "step": 790 }, { "epoch": 0.4507042253521127, "grad_norm": 2.1891305446624756, "learning_rate": 9.019672131147542e-06, "loss": 0.0151, "step": 800 }, { "epoch": 0.4563380281690141, "grad_norm": 2.1063690185546875, "learning_rate": 8.98688524590164e-06, "loss": 0.0175, "step": 810 }, { "epoch": 0.4619718309859155, "grad_norm": 1.6590553522109985, "learning_rate": 8.954098360655739e-06, "loss": 0.0176, "step": 820 }, { "epoch": 0.4676056338028169, "grad_norm": 1.7408928871154785, "learning_rate": 8.921311475409838e-06, "loss": 0.0141, "step": 830 }, { "epoch": 0.4732394366197183, "grad_norm": 3.2451632022857666, "learning_rate": 8.888524590163935e-06, "loss": 0.0187, "step": 840 }, { "epoch": 0.4788732394366197, "grad_norm": 2.8567867279052734, "learning_rate": 8.855737704918034e-06, "loss": 0.0285, "step": 850 }, { "epoch": 0.48450704225352115, "grad_norm": 1.448118805885315, "learning_rate": 8.822950819672131e-06, "loss": 0.0118, "step": 860 }, { "epoch": 0.49014084507042255, "grad_norm": 1.4737434387207031, "learning_rate": 8.79016393442623e-06, "loss": 0.0149, "step": 870 }, { "epoch": 0.49577464788732395, "grad_norm": 3.3012146949768066, "learning_rate": 8.75737704918033e-06, "loss": 0.0182, "step": 880 }, { "epoch": 0.5014084507042254, "grad_norm": 1.027005910873413, "learning_rate": 8.724590163934427e-06, "loss": 0.0148, "step": 890 }, { "epoch": 0.5070422535211268, "grad_norm": 2.394969940185547, "learning_rate": 8.691803278688526e-06, "loss": 0.0182, "step": 900 }, { "epoch": 0.5126760563380282, "grad_norm": 1.4397178888320923, "learning_rate": 8.659016393442625e-06, "loss": 0.0206, "step": 910 }, { "epoch": 0.5183098591549296, "grad_norm": 2.706894636154175, "learning_rate": 8.626229508196722e-06, "loss": 0.0209, "step": 920 }, { "epoch": 0.523943661971831, "grad_norm": 3.6924495697021484, "learning_rate": 8.593442622950821e-06, "loss": 0.0231, "step": 930 }, { "epoch": 0.5295774647887324, "grad_norm": 2.320352554321289, "learning_rate": 8.560655737704918e-06, "loss": 0.0229, "step": 940 }, { "epoch": 0.5352112676056338, "grad_norm": 2.570531129837036, "learning_rate": 8.527868852459018e-06, "loss": 0.0157, "step": 950 }, { "epoch": 0.5408450704225352, "grad_norm": 2.1590864658355713, "learning_rate": 8.495081967213117e-06, "loss": 0.0162, "step": 960 }, { "epoch": 0.5464788732394367, "grad_norm": 1.717673659324646, "learning_rate": 8.462295081967214e-06, "loss": 0.019, "step": 970 }, { "epoch": 0.5521126760563381, "grad_norm": 1.1120901107788086, "learning_rate": 8.429508196721313e-06, "loss": 0.012, "step": 980 }, { "epoch": 0.5577464788732395, "grad_norm": 2.0220296382904053, "learning_rate": 8.39672131147541e-06, "loss": 0.0195, "step": 990 }, { "epoch": 0.5633802816901409, "grad_norm": 3.2409744262695312, "learning_rate": 8.36393442622951e-06, "loss": 0.0196, "step": 1000 }, { "epoch": 0.5690140845070423, "grad_norm": 1.8710452318191528, "learning_rate": 8.331147540983607e-06, "loss": 0.0216, "step": 1010 }, { "epoch": 0.5746478873239437, "grad_norm": 1.902570366859436, "learning_rate": 8.298360655737706e-06, "loss": 0.0121, "step": 1020 }, { "epoch": 0.5802816901408451, "grad_norm": 2.0836708545684814, "learning_rate": 8.265573770491803e-06, "loss": 0.0207, "step": 1030 }, { "epoch": 0.5859154929577465, "grad_norm": 0.8831673264503479, "learning_rate": 8.232786885245902e-06, "loss": 0.0189, "step": 1040 }, { "epoch": 0.5915492957746479, "grad_norm": 1.7548481225967407, "learning_rate": 8.2e-06, "loss": 0.0197, "step": 1050 }, { "epoch": 0.5971830985915493, "grad_norm": 1.3168854713439941, "learning_rate": 8.167213114754098e-06, "loss": 0.0149, "step": 1060 }, { "epoch": 0.6028169014084507, "grad_norm": 1.703010082244873, "learning_rate": 8.134426229508197e-06, "loss": 0.0184, "step": 1070 }, { "epoch": 0.6084507042253521, "grad_norm": 1.3345087766647339, "learning_rate": 8.101639344262295e-06, "loss": 0.0168, "step": 1080 }, { "epoch": 0.6140845070422535, "grad_norm": 1.4866513013839722, "learning_rate": 8.068852459016394e-06, "loss": 0.0159, "step": 1090 }, { "epoch": 0.6197183098591549, "grad_norm": 0.8655949234962463, "learning_rate": 8.036065573770493e-06, "loss": 0.0227, "step": 1100 }, { "epoch": 0.6253521126760564, "grad_norm": 1.8502163887023926, "learning_rate": 8.00327868852459e-06, "loss": 0.0213, "step": 1110 }, { "epoch": 0.6309859154929578, "grad_norm": 2.3221960067749023, "learning_rate": 7.97049180327869e-06, "loss": 0.0168, "step": 1120 }, { "epoch": 0.6366197183098592, "grad_norm": 0.6024265289306641, "learning_rate": 7.937704918032788e-06, "loss": 0.0119, "step": 1130 }, { "epoch": 0.6422535211267606, "grad_norm": 1.0888503789901733, "learning_rate": 7.904918032786886e-06, "loss": 0.018, "step": 1140 }, { "epoch": 0.647887323943662, "grad_norm": 1.887101173400879, "learning_rate": 7.872131147540985e-06, "loss": 0.0174, "step": 1150 }, { "epoch": 0.6535211267605634, "grad_norm": 1.6141228675842285, "learning_rate": 7.839344262295082e-06, "loss": 0.019, "step": 1160 }, { "epoch": 0.6591549295774648, "grad_norm": 0.9001067876815796, "learning_rate": 7.806557377049181e-06, "loss": 0.0145, "step": 1170 }, { "epoch": 0.6647887323943662, "grad_norm": 0.6219407320022583, "learning_rate": 7.77377049180328e-06, "loss": 0.0167, "step": 1180 }, { "epoch": 0.6704225352112676, "grad_norm": 1.1784660816192627, "learning_rate": 7.740983606557377e-06, "loss": 0.0152, "step": 1190 }, { "epoch": 0.676056338028169, "grad_norm": 1.0849504470825195, "learning_rate": 7.708196721311476e-06, "loss": 0.0181, "step": 1200 }, { "epoch": 0.6816901408450704, "grad_norm": 1.2452244758605957, "learning_rate": 7.675409836065576e-06, "loss": 0.0166, "step": 1210 }, { "epoch": 0.6873239436619718, "grad_norm": 1.5875732898712158, "learning_rate": 7.642622950819673e-06, "loss": 0.0161, "step": 1220 }, { "epoch": 0.6929577464788732, "grad_norm": 1.530847430229187, "learning_rate": 7.609836065573771e-06, "loss": 0.0201, "step": 1230 }, { "epoch": 0.6985915492957746, "grad_norm": 2.098750114440918, "learning_rate": 7.577049180327869e-06, "loss": 0.0161, "step": 1240 }, { "epoch": 0.704225352112676, "grad_norm": 0.9777464270591736, "learning_rate": 7.5442622950819674e-06, "loss": 0.0149, "step": 1250 }, { "epoch": 0.7098591549295775, "grad_norm": 1.5550135374069214, "learning_rate": 7.5114754098360665e-06, "loss": 0.0159, "step": 1260 }, { "epoch": 0.7154929577464789, "grad_norm": 1.6158477067947388, "learning_rate": 7.478688524590164e-06, "loss": 0.0143, "step": 1270 }, { "epoch": 0.7211267605633803, "grad_norm": 1.2116320133209229, "learning_rate": 7.445901639344263e-06, "loss": 0.014, "step": 1280 }, { "epoch": 0.7267605633802817, "grad_norm": 2.0365331172943115, "learning_rate": 7.413114754098362e-06, "loss": 0.0231, "step": 1290 }, { "epoch": 0.7323943661971831, "grad_norm": 2.1054487228393555, "learning_rate": 7.380327868852459e-06, "loss": 0.0256, "step": 1300 }, { "epoch": 0.7380281690140845, "grad_norm": 1.5217443704605103, "learning_rate": 7.347540983606558e-06, "loss": 0.017, "step": 1310 }, { "epoch": 0.7436619718309859, "grad_norm": 2.1168699264526367, "learning_rate": 7.314754098360657e-06, "loss": 0.0132, "step": 1320 }, { "epoch": 0.7492957746478873, "grad_norm": 2.137946844100952, "learning_rate": 7.281967213114755e-06, "loss": 0.0169, "step": 1330 }, { "epoch": 0.7549295774647887, "grad_norm": 3.1900794506073, "learning_rate": 7.249180327868854e-06, "loss": 0.0155, "step": 1340 }, { "epoch": 0.7605633802816901, "grad_norm": 1.0735390186309814, "learning_rate": 7.216393442622951e-06, "loss": 0.0213, "step": 1350 }, { "epoch": 0.7661971830985915, "grad_norm": 2.0629560947418213, "learning_rate": 7.18360655737705e-06, "loss": 0.0184, "step": 1360 }, { "epoch": 0.7718309859154929, "grad_norm": 1.6908258199691772, "learning_rate": 7.150819672131148e-06, "loss": 0.0206, "step": 1370 }, { "epoch": 0.7774647887323943, "grad_norm": 1.9119372367858887, "learning_rate": 7.118032786885246e-06, "loss": 0.0156, "step": 1380 }, { "epoch": 0.7830985915492957, "grad_norm": 2.6538517475128174, "learning_rate": 7.085245901639345e-06, "loss": 0.0218, "step": 1390 }, { "epoch": 0.7887323943661971, "grad_norm": 1.286156177520752, "learning_rate": 7.052459016393444e-06, "loss": 0.0138, "step": 1400 }, { "epoch": 0.7943661971830986, "grad_norm": 2.077173948287964, "learning_rate": 7.019672131147541e-06, "loss": 0.0214, "step": 1410 }, { "epoch": 0.8, "grad_norm": 0.7982816696166992, "learning_rate": 6.98688524590164e-06, "loss": 0.0188, "step": 1420 }, { "epoch": 0.8056338028169014, "grad_norm": 1.7048382759094238, "learning_rate": 6.954098360655737e-06, "loss": 0.0131, "step": 1430 }, { "epoch": 0.8112676056338028, "grad_norm": 1.4576507806777954, "learning_rate": 6.921311475409836e-06, "loss": 0.0147, "step": 1440 }, { "epoch": 0.8169014084507042, "grad_norm": 1.2167617082595825, "learning_rate": 6.8885245901639354e-06, "loss": 0.0158, "step": 1450 }, { "epoch": 0.8225352112676056, "grad_norm": 2.0346646308898926, "learning_rate": 6.855737704918033e-06, "loss": 0.0158, "step": 1460 }, { "epoch": 0.828169014084507, "grad_norm": 1.2361705303192139, "learning_rate": 6.822950819672132e-06, "loss": 0.0101, "step": 1470 }, { "epoch": 0.8338028169014085, "grad_norm": 1.916527271270752, "learning_rate": 6.790163934426231e-06, "loss": 0.0147, "step": 1480 }, { "epoch": 0.8394366197183099, "grad_norm": 2.0439751148223877, "learning_rate": 6.757377049180328e-06, "loss": 0.0172, "step": 1490 }, { "epoch": 0.8450704225352113, "grad_norm": 1.4808157682418823, "learning_rate": 6.724590163934427e-06, "loss": 0.0144, "step": 1500 }, { "epoch": 0.8507042253521127, "grad_norm": 2.236619234085083, "learning_rate": 6.691803278688525e-06, "loss": 0.0124, "step": 1510 }, { "epoch": 0.856338028169014, "grad_norm": 1.0215883255004883, "learning_rate": 6.659016393442624e-06, "loss": 0.019, "step": 1520 }, { "epoch": 0.8619718309859155, "grad_norm": 2.6691057682037354, "learning_rate": 6.626229508196722e-06, "loss": 0.0251, "step": 1530 }, { "epoch": 0.8676056338028169, "grad_norm": 1.466689944267273, "learning_rate": 6.59344262295082e-06, "loss": 0.0142, "step": 1540 }, { "epoch": 0.8732394366197183, "grad_norm": 1.797670841217041, "learning_rate": 6.560655737704918e-06, "loss": 0.0164, "step": 1550 }, { "epoch": 0.8788732394366198, "grad_norm": 2.59198260307312, "learning_rate": 6.527868852459017e-06, "loss": 0.0252, "step": 1560 }, { "epoch": 0.8845070422535212, "grad_norm": 2.8690240383148193, "learning_rate": 6.4950819672131145e-06, "loss": 0.0185, "step": 1570 }, { "epoch": 0.8901408450704226, "grad_norm": 1.5174709558486938, "learning_rate": 6.4622950819672136e-06, "loss": 0.0197, "step": 1580 }, { "epoch": 0.895774647887324, "grad_norm": 0.8220402598381042, "learning_rate": 6.429508196721313e-06, "loss": 0.0186, "step": 1590 }, { "epoch": 0.9014084507042254, "grad_norm": 1.6295188665390015, "learning_rate": 6.39672131147541e-06, "loss": 0.0183, "step": 1600 }, { "epoch": 0.9070422535211268, "grad_norm": 1.0216023921966553, "learning_rate": 6.363934426229509e-06, "loss": 0.0149, "step": 1610 }, { "epoch": 0.9126760563380282, "grad_norm": 1.4068635702133179, "learning_rate": 6.331147540983608e-06, "loss": 0.0113, "step": 1620 }, { "epoch": 0.9183098591549296, "grad_norm": 1.1198160648345947, "learning_rate": 6.298360655737705e-06, "loss": 0.0123, "step": 1630 }, { "epoch": 0.923943661971831, "grad_norm": 1.9338263273239136, "learning_rate": 6.265573770491804e-06, "loss": 0.0133, "step": 1640 }, { "epoch": 0.9295774647887324, "grad_norm": 1.9467333555221558, "learning_rate": 6.232786885245902e-06, "loss": 0.0168, "step": 1650 }, { "epoch": 0.9352112676056338, "grad_norm": 5.4310712814331055, "learning_rate": 6.200000000000001e-06, "loss": 0.0183, "step": 1660 }, { "epoch": 0.9408450704225352, "grad_norm": 7.56189489364624, "learning_rate": 6.167213114754099e-06, "loss": 0.0142, "step": 1670 }, { "epoch": 0.9464788732394366, "grad_norm": 1.2114770412445068, "learning_rate": 6.134426229508197e-06, "loss": 0.0153, "step": 1680 }, { "epoch": 0.952112676056338, "grad_norm": 0.7653253078460693, "learning_rate": 6.101639344262295e-06, "loss": 0.0214, "step": 1690 }, { "epoch": 0.9577464788732394, "grad_norm": 1.0881645679473877, "learning_rate": 6.068852459016394e-06, "loss": 0.0165, "step": 1700 }, { "epoch": 0.9633802816901409, "grad_norm": 1.2689759731292725, "learning_rate": 6.036065573770492e-06, "loss": 0.0129, "step": 1710 }, { "epoch": 0.9690140845070423, "grad_norm": 1.2382926940917969, "learning_rate": 6.003278688524591e-06, "loss": 0.013, "step": 1720 }, { "epoch": 0.9746478873239437, "grad_norm": 1.3045654296875, "learning_rate": 5.970491803278688e-06, "loss": 0.0149, "step": 1730 }, { "epoch": 0.9802816901408451, "grad_norm": 1.588781476020813, "learning_rate": 5.937704918032787e-06, "loss": 0.0151, "step": 1740 }, { "epoch": 0.9859154929577465, "grad_norm": 0.9168156981468201, "learning_rate": 5.904918032786886e-06, "loss": 0.0163, "step": 1750 }, { "epoch": 0.9915492957746479, "grad_norm": 1.5272772312164307, "learning_rate": 5.8721311475409835e-06, "loss": 0.0127, "step": 1760 }, { "epoch": 0.9971830985915493, "grad_norm": 2.0196149349212646, "learning_rate": 5.8393442622950825e-06, "loss": 0.0167, "step": 1770 }, { "epoch": 1.0028169014084507, "grad_norm": 1.132421612739563, "learning_rate": 5.806557377049182e-06, "loss": 0.0111, "step": 1780 }, { "epoch": 1.008450704225352, "grad_norm": 0.7580140233039856, "learning_rate": 5.773770491803279e-06, "loss": 0.0042, "step": 1790 }, { "epoch": 1.0140845070422535, "grad_norm": 0.7441082000732422, "learning_rate": 5.740983606557378e-06, "loss": 0.0055, "step": 1800 }, { "epoch": 1.019718309859155, "grad_norm": 0.689598798751831, "learning_rate": 5.708196721311476e-06, "loss": 0.0067, "step": 1810 }, { "epoch": 1.0253521126760563, "grad_norm": 1.3248918056488037, "learning_rate": 5.675409836065574e-06, "loss": 0.0071, "step": 1820 }, { "epoch": 1.0309859154929577, "grad_norm": 0.6853448152542114, "learning_rate": 5.6426229508196725e-06, "loss": 0.0043, "step": 1830 }, { "epoch": 1.036619718309859, "grad_norm": 0.718917191028595, "learning_rate": 5.609836065573771e-06, "loss": 0.0047, "step": 1840 }, { "epoch": 1.0422535211267605, "grad_norm": 0.6834334135055542, "learning_rate": 5.577049180327869e-06, "loss": 0.0047, "step": 1850 }, { "epoch": 1.047887323943662, "grad_norm": 0.4496719539165497, "learning_rate": 5.544262295081968e-06, "loss": 0.0045, "step": 1860 }, { "epoch": 1.0535211267605633, "grad_norm": 0.11652589589357376, "learning_rate": 5.511475409836065e-06, "loss": 0.0081, "step": 1870 }, { "epoch": 1.0591549295774647, "grad_norm": 0.28196972608566284, "learning_rate": 5.478688524590164e-06, "loss": 0.0064, "step": 1880 }, { "epoch": 1.064788732394366, "grad_norm": 0.2863517701625824, "learning_rate": 5.445901639344263e-06, "loss": 0.0066, "step": 1890 }, { "epoch": 1.0704225352112675, "grad_norm": 0.3156941831111908, "learning_rate": 5.413114754098361e-06, "loss": 0.004, "step": 1900 }, { "epoch": 1.076056338028169, "grad_norm": 1.2497857809066772, "learning_rate": 5.38032786885246e-06, "loss": 0.0082, "step": 1910 }, { "epoch": 1.0816901408450703, "grad_norm": 0.25184062123298645, "learning_rate": 5.347540983606557e-06, "loss": 0.0061, "step": 1920 }, { "epoch": 1.0873239436619717, "grad_norm": 0.2447606921195984, "learning_rate": 5.314754098360656e-06, "loss": 0.0035, "step": 1930 }, { "epoch": 1.0929577464788733, "grad_norm": 1.0463167428970337, "learning_rate": 5.281967213114755e-06, "loss": 0.0069, "step": 1940 }, { "epoch": 1.0985915492957747, "grad_norm": 0.7045953273773193, "learning_rate": 5.2491803278688525e-06, "loss": 0.0091, "step": 1950 }, { "epoch": 1.1042253521126761, "grad_norm": 2.306065559387207, "learning_rate": 5.2163934426229515e-06, "loss": 0.0063, "step": 1960 }, { "epoch": 1.1098591549295775, "grad_norm": 0.23815812170505524, "learning_rate": 5.18360655737705e-06, "loss": 0.007, "step": 1970 }, { "epoch": 1.115492957746479, "grad_norm": 0.8824997544288635, "learning_rate": 5.150819672131148e-06, "loss": 0.0035, "step": 1980 }, { "epoch": 1.1211267605633803, "grad_norm": 1.441972017288208, "learning_rate": 5.118032786885246e-06, "loss": 0.0046, "step": 1990 }, { "epoch": 1.1267605633802817, "grad_norm": 0.5692065954208374, "learning_rate": 5.085245901639345e-06, "loss": 0.0034, "step": 2000 }, { "epoch": 1.1267605633802817, "eval_loss": 0.006601665634661913, "eval_runtime": 10204.8454, "eval_samples_per_second": 1.391, "eval_steps_per_second": 0.174, "eval_wer": 0.6022624256040361, "step": 2000 }, { "epoch": 1.1323943661971831, "grad_norm": 1.9811164140701294, "learning_rate": 5.0524590163934425e-06, "loss": 0.0052, "step": 2010 }, { "epoch": 1.1380281690140845, "grad_norm": 1.9381415843963623, "learning_rate": 5.0196721311475415e-06, "loss": 0.0074, "step": 2020 }, { "epoch": 1.143661971830986, "grad_norm": 0.738838255405426, "learning_rate": 4.98688524590164e-06, "loss": 0.0057, "step": 2030 }, { "epoch": 1.1492957746478873, "grad_norm": 0.6865407824516296, "learning_rate": 4.954098360655738e-06, "loss": 0.0044, "step": 2040 }, { "epoch": 1.1549295774647887, "grad_norm": 0.4846172630786896, "learning_rate": 4.921311475409836e-06, "loss": 0.0064, "step": 2050 }, { "epoch": 1.1605633802816901, "grad_norm": 1.1418219804763794, "learning_rate": 4.888524590163935e-06, "loss": 0.0056, "step": 2060 }, { "epoch": 1.1661971830985915, "grad_norm": 0.5858073830604553, "learning_rate": 4.855737704918033e-06, "loss": 0.0079, "step": 2070 }, { "epoch": 1.171830985915493, "grad_norm": 0.661310613155365, "learning_rate": 4.8229508196721315e-06, "loss": 0.0048, "step": 2080 }, { "epoch": 1.1774647887323944, "grad_norm": 0.8238784670829773, "learning_rate": 4.79016393442623e-06, "loss": 0.0055, "step": 2090 }, { "epoch": 1.1830985915492958, "grad_norm": 1.049481987953186, "learning_rate": 4.757377049180329e-06, "loss": 0.0034, "step": 2100 }, { "epoch": 1.1887323943661972, "grad_norm": 1.2742713689804077, "learning_rate": 4.724590163934427e-06, "loss": 0.0061, "step": 2110 }, { "epoch": 1.1943661971830986, "grad_norm": 1.1974469423294067, "learning_rate": 4.691803278688525e-06, "loss": 0.0056, "step": 2120 }, { "epoch": 1.2, "grad_norm": 0.23014897108078003, "learning_rate": 4.659016393442623e-06, "loss": 0.0041, "step": 2130 }, { "epoch": 1.2056338028169014, "grad_norm": 0.2948243319988251, "learning_rate": 4.6262295081967215e-06, "loss": 0.0067, "step": 2140 }, { "epoch": 1.2112676056338028, "grad_norm": 0.5586726069450378, "learning_rate": 4.59344262295082e-06, "loss": 0.0074, "step": 2150 }, { "epoch": 1.2169014084507042, "grad_norm": 1.4075509309768677, "learning_rate": 4.560655737704918e-06, "loss": 0.0037, "step": 2160 }, { "epoch": 1.2225352112676056, "grad_norm": 0.9339226484298706, "learning_rate": 4.527868852459017e-06, "loss": 0.0055, "step": 2170 }, { "epoch": 1.228169014084507, "grad_norm": 1.2240931987762451, "learning_rate": 4.495081967213115e-06, "loss": 0.0043, "step": 2180 }, { "epoch": 1.2338028169014084, "grad_norm": 1.8513785600662231, "learning_rate": 4.462295081967213e-06, "loss": 0.0053, "step": 2190 }, { "epoch": 1.2394366197183098, "grad_norm": 1.798017144203186, "learning_rate": 4.4295081967213114e-06, "loss": 0.0038, "step": 2200 }, { "epoch": 1.2450704225352114, "grad_norm": 0.5782124400138855, "learning_rate": 4.3967213114754105e-06, "loss": 0.0046, "step": 2210 }, { "epoch": 1.2507042253521128, "grad_norm": 1.0610089302062988, "learning_rate": 4.363934426229509e-06, "loss": 0.0049, "step": 2220 }, { "epoch": 1.2563380281690142, "grad_norm": 0.3483627438545227, "learning_rate": 4.331147540983607e-06, "loss": 0.0057, "step": 2230 }, { "epoch": 1.2619718309859156, "grad_norm": 0.542745053768158, "learning_rate": 4.298360655737705e-06, "loss": 0.0048, "step": 2240 }, { "epoch": 1.267605633802817, "grad_norm": 0.8337019085884094, "learning_rate": 4.265573770491804e-06, "loss": 0.0035, "step": 2250 }, { "epoch": 1.2732394366197184, "grad_norm": 0.651922881603241, "learning_rate": 4.232786885245902e-06, "loss": 0.0045, "step": 2260 }, { "epoch": 1.2788732394366198, "grad_norm": 0.18958942592144012, "learning_rate": 4.2000000000000004e-06, "loss": 0.0038, "step": 2270 }, { "epoch": 1.2845070422535212, "grad_norm": 1.0927441120147705, "learning_rate": 4.167213114754099e-06, "loss": 0.0028, "step": 2280 }, { "epoch": 1.2901408450704226, "grad_norm": 0.5457190275192261, "learning_rate": 4.134426229508197e-06, "loss": 0.0051, "step": 2290 }, { "epoch": 1.295774647887324, "grad_norm": 1.858451247215271, "learning_rate": 4.101639344262295e-06, "loss": 0.0044, "step": 2300 }, { "epoch": 1.3014084507042254, "grad_norm": 1.4860936403274536, "learning_rate": 4.068852459016393e-06, "loss": 0.0039, "step": 2310 }, { "epoch": 1.3070422535211268, "grad_norm": 0.6476758718490601, "learning_rate": 4.036065573770492e-06, "loss": 0.0071, "step": 2320 }, { "epoch": 1.3126760563380282, "grad_norm": 1.3076200485229492, "learning_rate": 4.0032786885245904e-06, "loss": 0.0032, "step": 2330 }, { "epoch": 1.3183098591549296, "grad_norm": 0.8313450217247009, "learning_rate": 3.970491803278689e-06, "loss": 0.0038, "step": 2340 }, { "epoch": 1.323943661971831, "grad_norm": 0.3582000732421875, "learning_rate": 3.937704918032787e-06, "loss": 0.0024, "step": 2350 }, { "epoch": 1.3295774647887324, "grad_norm": 0.40098199248313904, "learning_rate": 3.904918032786886e-06, "loss": 0.0036, "step": 2360 }, { "epoch": 1.3352112676056338, "grad_norm": 1.560827374458313, "learning_rate": 3.872131147540984e-06, "loss": 0.004, "step": 2370 }, { "epoch": 1.3408450704225352, "grad_norm": 1.1103460788726807, "learning_rate": 3.839344262295082e-06, "loss": 0.0049, "step": 2380 }, { "epoch": 1.3464788732394366, "grad_norm": 1.1305570602416992, "learning_rate": 3.8065573770491804e-06, "loss": 0.0075, "step": 2390 }, { "epoch": 1.352112676056338, "grad_norm": 0.5738380551338196, "learning_rate": 3.773770491803279e-06, "loss": 0.003, "step": 2400 }, { "epoch": 1.3577464788732394, "grad_norm": 0.09693071246147156, "learning_rate": 3.740983606557377e-06, "loss": 0.0038, "step": 2410 }, { "epoch": 1.3633802816901408, "grad_norm": 0.3580220639705658, "learning_rate": 3.7081967213114754e-06, "loss": 0.0032, "step": 2420 }, { "epoch": 1.3690140845070422, "grad_norm": 0.6892378330230713, "learning_rate": 3.6754098360655744e-06, "loss": 0.0031, "step": 2430 }, { "epoch": 1.3746478873239436, "grad_norm": 0.6783043146133423, "learning_rate": 3.6426229508196726e-06, "loss": 0.0038, "step": 2440 }, { "epoch": 1.380281690140845, "grad_norm": 0.3925527036190033, "learning_rate": 3.609836065573771e-06, "loss": 0.0041, "step": 2450 }, { "epoch": 1.3859154929577464, "grad_norm": 0.9464495182037354, "learning_rate": 3.577049180327869e-06, "loss": 0.007, "step": 2460 }, { "epoch": 1.3915492957746478, "grad_norm": 0.13250884413719177, "learning_rate": 3.5442622950819676e-06, "loss": 0.0045, "step": 2470 }, { "epoch": 1.3971830985915492, "grad_norm": 0.9196420311927795, "learning_rate": 3.511475409836066e-06, "loss": 0.0036, "step": 2480 }, { "epoch": 1.4028169014084506, "grad_norm": 0.21822892129421234, "learning_rate": 3.478688524590164e-06, "loss": 0.0038, "step": 2490 }, { "epoch": 1.408450704225352, "grad_norm": 0.4487333297729492, "learning_rate": 3.445901639344262e-06, "loss": 0.0033, "step": 2500 }, { "epoch": 1.4140845070422534, "grad_norm": 0.37971436977386475, "learning_rate": 3.413114754098361e-06, "loss": 0.0039, "step": 2510 }, { "epoch": 1.4197183098591548, "grad_norm": 0.2940213978290558, "learning_rate": 3.3803278688524594e-06, "loss": 0.0088, "step": 2520 }, { "epoch": 1.4253521126760562, "grad_norm": 0.3230305314064026, "learning_rate": 3.3475409836065576e-06, "loss": 0.004, "step": 2530 }, { "epoch": 1.4309859154929576, "grad_norm": 0.3056629002094269, "learning_rate": 3.3147540983606558e-06, "loss": 0.0035, "step": 2540 }, { "epoch": 1.436619718309859, "grad_norm": 0.2812763452529907, "learning_rate": 3.2819672131147544e-06, "loss": 0.0048, "step": 2550 }, { "epoch": 1.4422535211267606, "grad_norm": 0.24654440581798553, "learning_rate": 3.2491803278688526e-06, "loss": 0.0052, "step": 2560 }, { "epoch": 1.447887323943662, "grad_norm": 1.0222922563552856, "learning_rate": 3.2163934426229508e-06, "loss": 0.0057, "step": 2570 }, { "epoch": 1.4535211267605634, "grad_norm": 0.288112998008728, "learning_rate": 3.18360655737705e-06, "loss": 0.0031, "step": 2580 }, { "epoch": 1.4591549295774648, "grad_norm": 1.0873970985412598, "learning_rate": 3.150819672131148e-06, "loss": 0.0072, "step": 2590 }, { "epoch": 1.4647887323943662, "grad_norm": 0.48980048298835754, "learning_rate": 3.118032786885246e-06, "loss": 0.0039, "step": 2600 }, { "epoch": 1.4704225352112676, "grad_norm": 0.40013420581817627, "learning_rate": 3.0852459016393444e-06, "loss": 0.0026, "step": 2610 }, { "epoch": 1.476056338028169, "grad_norm": 0.2551349103450775, "learning_rate": 3.052459016393443e-06, "loss": 0.0061, "step": 2620 }, { "epoch": 1.4816901408450704, "grad_norm": 0.8691345453262329, "learning_rate": 3.019672131147541e-06, "loss": 0.0055, "step": 2630 }, { "epoch": 1.4873239436619718, "grad_norm": 1.7804378271102905, "learning_rate": 2.9868852459016394e-06, "loss": 0.0037, "step": 2640 }, { "epoch": 1.4929577464788732, "grad_norm": 0.4038809835910797, "learning_rate": 2.9540983606557375e-06, "loss": 0.0058, "step": 2650 }, { "epoch": 1.4985915492957746, "grad_norm": 0.16689522564411163, "learning_rate": 2.9213114754098366e-06, "loss": 0.0051, "step": 2660 }, { "epoch": 1.504225352112676, "grad_norm": 1.197608232498169, "learning_rate": 2.8885245901639348e-06, "loss": 0.0065, "step": 2670 }, { "epoch": 1.5098591549295775, "grad_norm": 0.7976368069648743, "learning_rate": 2.855737704918033e-06, "loss": 0.004, "step": 2680 }, { "epoch": 1.5154929577464789, "grad_norm": 0.21786408126354218, "learning_rate": 2.822950819672131e-06, "loss": 0.0048, "step": 2690 }, { "epoch": 1.5211267605633803, "grad_norm": 0.19645968079566956, "learning_rate": 2.7901639344262298e-06, "loss": 0.0038, "step": 2700 }, { "epoch": 1.5267605633802817, "grad_norm": 0.29505646228790283, "learning_rate": 2.757377049180328e-06, "loss": 0.0034, "step": 2710 }, { "epoch": 1.532394366197183, "grad_norm": 1.0378403663635254, "learning_rate": 2.724590163934426e-06, "loss": 0.004, "step": 2720 }, { "epoch": 1.5380281690140845, "grad_norm": 0.23801620304584503, "learning_rate": 2.6918032786885247e-06, "loss": 0.0031, "step": 2730 }, { "epoch": 1.543661971830986, "grad_norm": 0.5884902477264404, "learning_rate": 2.6590163934426234e-06, "loss": 0.0025, "step": 2740 }, { "epoch": 1.5492957746478875, "grad_norm": 0.492400586605072, "learning_rate": 2.6262295081967215e-06, "loss": 0.0029, "step": 2750 }, { "epoch": 1.5549295774647889, "grad_norm": 0.6120467185974121, "learning_rate": 2.5934426229508197e-06, "loss": 0.0035, "step": 2760 }, { "epoch": 1.5605633802816903, "grad_norm": 0.5753558278083801, "learning_rate": 2.5606557377049183e-06, "loss": 0.0026, "step": 2770 }, { "epoch": 1.5661971830985917, "grad_norm": 0.6716064214706421, "learning_rate": 2.5278688524590165e-06, "loss": 0.0031, "step": 2780 }, { "epoch": 1.571830985915493, "grad_norm": 0.5276569128036499, "learning_rate": 2.4950819672131147e-06, "loss": 0.0046, "step": 2790 }, { "epoch": 1.5774647887323945, "grad_norm": 0.2546756863594055, "learning_rate": 2.4622950819672133e-06, "loss": 0.0033, "step": 2800 }, { "epoch": 1.5830985915492959, "grad_norm": 0.40880855917930603, "learning_rate": 2.4295081967213115e-06, "loss": 0.0021, "step": 2810 }, { "epoch": 1.5887323943661973, "grad_norm": 0.5505129098892212, "learning_rate": 2.39672131147541e-06, "loss": 0.0061, "step": 2820 }, { "epoch": 1.5943661971830987, "grad_norm": 0.5350093245506287, "learning_rate": 2.3639344262295083e-06, "loss": 0.0025, "step": 2830 }, { "epoch": 1.6, "grad_norm": 0.5841212272644043, "learning_rate": 2.331147540983607e-06, "loss": 0.0054, "step": 2840 }, { "epoch": 1.6056338028169015, "grad_norm": 0.5653055310249329, "learning_rate": 2.298360655737705e-06, "loss": 0.0045, "step": 2850 }, { "epoch": 1.611267605633803, "grad_norm": 1.1292093992233276, "learning_rate": 2.2655737704918033e-06, "loss": 0.0027, "step": 2860 }, { "epoch": 1.6169014084507043, "grad_norm": 0.5028222799301147, "learning_rate": 2.2327868852459015e-06, "loss": 0.0034, "step": 2870 }, { "epoch": 1.6225352112676057, "grad_norm": 0.5323085188865662, "learning_rate": 2.2e-06, "loss": 0.0036, "step": 2880 }, { "epoch": 1.628169014084507, "grad_norm": 0.9812871813774109, "learning_rate": 2.1672131147540983e-06, "loss": 0.0035, "step": 2890 }, { "epoch": 1.6338028169014085, "grad_norm": 1.2383418083190918, "learning_rate": 2.134426229508197e-06, "loss": 0.0034, "step": 2900 }, { "epoch": 1.63943661971831, "grad_norm": 0.6658313870429993, "learning_rate": 2.1016393442622955e-06, "loss": 0.0039, "step": 2910 }, { "epoch": 1.6450704225352113, "grad_norm": 0.8102785348892212, "learning_rate": 2.0688524590163937e-06, "loss": 0.0053, "step": 2920 }, { "epoch": 1.6507042253521127, "grad_norm": 1.0971978902816772, "learning_rate": 2.036065573770492e-06, "loss": 0.0025, "step": 2930 }, { "epoch": 1.656338028169014, "grad_norm": 1.2116867303848267, "learning_rate": 2.00327868852459e-06, "loss": 0.0046, "step": 2940 }, { "epoch": 1.6619718309859155, "grad_norm": 0.9780658483505249, "learning_rate": 1.9704918032786887e-06, "loss": 0.003, "step": 2950 }, { "epoch": 1.667605633802817, "grad_norm": 0.24054431915283203, "learning_rate": 1.937704918032787e-06, "loss": 0.003, "step": 2960 }, { "epoch": 1.6732394366197183, "grad_norm": 0.6839045882225037, "learning_rate": 1.9049180327868855e-06, "loss": 0.0047, "step": 2970 }, { "epoch": 1.6788732394366197, "grad_norm": 1.620792031288147, "learning_rate": 1.8721311475409837e-06, "loss": 0.0079, "step": 2980 }, { "epoch": 1.684507042253521, "grad_norm": 0.45428532361984253, "learning_rate": 1.839344262295082e-06, "loss": 0.0027, "step": 2990 }, { "epoch": 1.6901408450704225, "grad_norm": 0.6544379591941833, "learning_rate": 1.8065573770491803e-06, "loss": 0.0037, "step": 3000 }, { "epoch": 1.695774647887324, "grad_norm": 0.38664039969444275, "learning_rate": 1.7737704918032789e-06, "loss": 0.0037, "step": 3010 }, { "epoch": 1.7014084507042253, "grad_norm": 0.975898027420044, "learning_rate": 1.740983606557377e-06, "loss": 0.0051, "step": 3020 }, { "epoch": 1.7070422535211267, "grad_norm": 0.5585213899612427, "learning_rate": 1.7081967213114755e-06, "loss": 0.0038, "step": 3030 }, { "epoch": 1.712676056338028, "grad_norm": 0.277058482170105, "learning_rate": 1.6754098360655739e-06, "loss": 0.0019, "step": 3040 }, { "epoch": 1.7183098591549295, "grad_norm": 0.4337478578090668, "learning_rate": 1.6426229508196723e-06, "loss": 0.0033, "step": 3050 }, { "epoch": 1.723943661971831, "grad_norm": 0.6622017621994019, "learning_rate": 1.6098360655737707e-06, "loss": 0.0043, "step": 3060 }, { "epoch": 1.7295774647887323, "grad_norm": 0.10237481445074081, "learning_rate": 1.5770491803278689e-06, "loss": 0.0052, "step": 3070 }, { "epoch": 1.7352112676056337, "grad_norm": 0.21056203544139862, "learning_rate": 1.5442622950819675e-06, "loss": 0.003, "step": 3080 }, { "epoch": 1.7408450704225351, "grad_norm": 0.6057843565940857, "learning_rate": 1.5114754098360657e-06, "loss": 0.0037, "step": 3090 }, { "epoch": 1.7464788732394365, "grad_norm": 1.0593922138214111, "learning_rate": 1.478688524590164e-06, "loss": 0.0043, "step": 3100 }, { "epoch": 1.752112676056338, "grad_norm": 0.18801669776439667, "learning_rate": 1.4459016393442623e-06, "loss": 0.0029, "step": 3110 }, { "epoch": 1.7577464788732393, "grad_norm": 1.073035478591919, "learning_rate": 1.4131147540983609e-06, "loss": 0.0046, "step": 3120 }, { "epoch": 1.7633802816901407, "grad_norm": 0.9961406588554382, "learning_rate": 1.380327868852459e-06, "loss": 0.0042, "step": 3130 }, { "epoch": 1.7690140845070421, "grad_norm": 0.6435835957527161, "learning_rate": 1.3475409836065575e-06, "loss": 0.0048, "step": 3140 }, { "epoch": 1.7746478873239435, "grad_norm": 0.7369032502174377, "learning_rate": 1.3147540983606559e-06, "loss": 0.0047, "step": 3150 }, { "epoch": 1.780281690140845, "grad_norm": 1.5489526987075806, "learning_rate": 1.2819672131147543e-06, "loss": 0.0082, "step": 3160 }, { "epoch": 1.7859154929577463, "grad_norm": 0.20381329953670502, "learning_rate": 1.2491803278688527e-06, "loss": 0.0021, "step": 3170 }, { "epoch": 1.7915492957746477, "grad_norm": 0.2610226571559906, "learning_rate": 1.2163934426229509e-06, "loss": 0.0028, "step": 3180 }, { "epoch": 1.7971830985915493, "grad_norm": 0.378866046667099, "learning_rate": 1.1836065573770493e-06, "loss": 0.0028, "step": 3190 }, { "epoch": 1.8028169014084507, "grad_norm": 0.5652082562446594, "learning_rate": 1.1508196721311477e-06, "loss": 0.0033, "step": 3200 }, { "epoch": 1.8084507042253521, "grad_norm": 0.3631172180175781, "learning_rate": 1.118032786885246e-06, "loss": 0.0029, "step": 3210 }, { "epoch": 1.8140845070422535, "grad_norm": 0.40755710005760193, "learning_rate": 1.0852459016393442e-06, "loss": 0.0038, "step": 3220 }, { "epoch": 1.819718309859155, "grad_norm": 0.4334737956523895, "learning_rate": 1.0524590163934426e-06, "loss": 0.0037, "step": 3230 }, { "epoch": 1.8253521126760563, "grad_norm": 0.5602841377258301, "learning_rate": 1.019672131147541e-06, "loss": 0.0024, "step": 3240 }, { "epoch": 1.8309859154929577, "grad_norm": 1.8210504055023193, "learning_rate": 9.868852459016394e-07, "loss": 0.0027, "step": 3250 }, { "epoch": 1.8366197183098592, "grad_norm": 0.1980561465024948, "learning_rate": 9.540983606557378e-07, "loss": 0.005, "step": 3260 }, { "epoch": 1.8422535211267606, "grad_norm": 0.10875657945871353, "learning_rate": 9.213114754098361e-07, "loss": 0.0031, "step": 3270 }, { "epoch": 1.847887323943662, "grad_norm": 1.1292788982391357, "learning_rate": 8.885245901639344e-07, "loss": 0.0074, "step": 3280 }, { "epoch": 1.8535211267605634, "grad_norm": 1.2666053771972656, "learning_rate": 8.557377049180328e-07, "loss": 0.0031, "step": 3290 }, { "epoch": 1.8591549295774648, "grad_norm": 1.206641435623169, "learning_rate": 8.229508196721312e-07, "loss": 0.0043, "step": 3300 }, { "epoch": 1.8647887323943662, "grad_norm": 0.4260517656803131, "learning_rate": 7.901639344262296e-07, "loss": 0.0025, "step": 3310 }, { "epoch": 1.8704225352112676, "grad_norm": 0.22786042094230652, "learning_rate": 7.573770491803279e-07, "loss": 0.0034, "step": 3320 }, { "epoch": 1.8760563380281692, "grad_norm": 0.25025302171707153, "learning_rate": 7.245901639344263e-07, "loss": 0.0052, "step": 3330 }, { "epoch": 1.8816901408450706, "grad_norm": 0.051826998591423035, "learning_rate": 6.918032786885247e-07, "loss": 0.0031, "step": 3340 }, { "epoch": 1.887323943661972, "grad_norm": 1.0378538370132446, "learning_rate": 6.59016393442623e-07, "loss": 0.0046, "step": 3350 }, { "epoch": 1.8929577464788734, "grad_norm": 0.21233128011226654, "learning_rate": 6.262295081967214e-07, "loss": 0.0046, "step": 3360 }, { "epoch": 1.8985915492957748, "grad_norm": 0.12382949888706207, "learning_rate": 5.934426229508197e-07, "loss": 0.0024, "step": 3370 }, { "epoch": 1.9042253521126762, "grad_norm": 0.5659042596817017, "learning_rate": 5.606557377049181e-07, "loss": 0.0033, "step": 3380 }, { "epoch": 1.9098591549295776, "grad_norm": 0.7022256255149841, "learning_rate": 5.278688524590164e-07, "loss": 0.0032, "step": 3390 }, { "epoch": 1.915492957746479, "grad_norm": 0.71797114610672, "learning_rate": 4.950819672131148e-07, "loss": 0.0028, "step": 3400 }, { "epoch": 1.9211267605633804, "grad_norm": 0.709237277507782, "learning_rate": 4.622950819672131e-07, "loss": 0.0056, "step": 3410 }, { "epoch": 1.9267605633802818, "grad_norm": 0.5479326248168945, "learning_rate": 4.2950819672131156e-07, "loss": 0.0023, "step": 3420 }, { "epoch": 1.9323943661971832, "grad_norm": 0.1909080147743225, "learning_rate": 3.967213114754099e-07, "loss": 0.0034, "step": 3430 }, { "epoch": 1.9380281690140846, "grad_norm": 0.8031314611434937, "learning_rate": 3.6393442622950825e-07, "loss": 0.0029, "step": 3440 }, { "epoch": 1.943661971830986, "grad_norm": 0.4985907971858978, "learning_rate": 3.311475409836066e-07, "loss": 0.0043, "step": 3450 }, { "epoch": 1.9492957746478874, "grad_norm": 0.36522606015205383, "learning_rate": 2.9836065573770495e-07, "loss": 0.0041, "step": 3460 }, { "epoch": 1.9549295774647888, "grad_norm": 0.38089874386787415, "learning_rate": 2.655737704918033e-07, "loss": 0.0048, "step": 3470 }, { "epoch": 1.9605633802816902, "grad_norm": 1.044329047203064, "learning_rate": 2.3278688524590167e-07, "loss": 0.003, "step": 3480 }, { "epoch": 1.9661971830985916, "grad_norm": 0.798667848110199, "learning_rate": 2.0000000000000002e-07, "loss": 0.0021, "step": 3490 }, { "epoch": 1.971830985915493, "grad_norm": 0.1679508090019226, "learning_rate": 1.6721311475409836e-07, "loss": 0.0019, "step": 3500 }, { "epoch": 1.9774647887323944, "grad_norm": 0.15312330424785614, "learning_rate": 1.3442622950819674e-07, "loss": 0.0026, "step": 3510 }, { "epoch": 1.9830985915492958, "grad_norm": 0.38976049423217773, "learning_rate": 1.0163934426229509e-07, "loss": 0.0038, "step": 3520 }, { "epoch": 1.9887323943661972, "grad_norm": 0.8329457640647888, "learning_rate": 6.885245901639345e-08, "loss": 0.0038, "step": 3530 }, { "epoch": 1.9943661971830986, "grad_norm": 0.30528759956359863, "learning_rate": 3.606557377049181e-08, "loss": 0.0027, "step": 3540 }, { "epoch": 2.0, "grad_norm": 0.49645933508872986, "learning_rate": 3.278688524590164e-09, "loss": 0.0033, "step": 3550 } ], "logging_steps": 10, "max_steps": 3550, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.1593663507464192e+20, "train_batch_size": 4, "trial_name": null, "trial_params": null }