{
  "best_global_step": 2000,
  "best_metric": 0.6022624256040361,
  "best_model_checkpoint": "./SALAMA_NEWMEDT/checkpoint-2000",
  "epoch": 1.1267605633802817,
  "eval_steps": 2000,
  "global_step": 2000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.005633802816901409,
      "grad_norm": 1.2029443979263306,
      "learning_rate": 1.8e-07,
      "loss": 0.0085,
      "step": 10
    },
    {
      "epoch": 0.011267605633802818,
      "grad_norm": 1.1895703077316284,
      "learning_rate": 3.8e-07,
      "loss": 0.0089,
      "step": 20
    },
    {
      "epoch": 0.016901408450704224,
      "grad_norm": 0.9030239582061768,
      "learning_rate": 5.800000000000001e-07,
      "loss": 0.0078,
      "step": 30
    },
    {
      "epoch": 0.022535211267605635,
      "grad_norm": 1.161505103111267,
      "learning_rate": 7.8e-07,
      "loss": 0.0101,
      "step": 40
    },
    {
      "epoch": 0.028169014084507043,
      "grad_norm": 0.7956001162528992,
      "learning_rate": 9.800000000000001e-07,
      "loss": 0.0087,
      "step": 50
    },
    {
      "epoch": 0.03380281690140845,
      "grad_norm": 0.9211345911026001,
      "learning_rate": 1.1800000000000001e-06,
      "loss": 0.0083,
      "step": 60
    },
    {
      "epoch": 0.03943661971830986,
      "grad_norm": 1.4743120670318604,
      "learning_rate": 1.3800000000000001e-06,
      "loss": 0.0125,
      "step": 70
    },
    {
      "epoch": 0.04507042253521127,
      "grad_norm": 2.1086606979370117,
      "learning_rate": 1.5800000000000001e-06,
      "loss": 0.0072,
      "step": 80
    },
    {
      "epoch": 0.05070422535211268,
      "grad_norm": 0.6348907351493835,
      "learning_rate": 1.7800000000000001e-06,
      "loss": 0.0054,
      "step": 90
    },
    {
      "epoch": 0.056338028169014086,
      "grad_norm": 0.7495781779289246,
      "learning_rate": 1.98e-06,
      "loss": 0.0089,
      "step": 100
    },
    {
      "epoch": 0.061971830985915494,
      "grad_norm": 0.7683187127113342,
      "learning_rate": 2.1800000000000003e-06,
      "loss": 0.0112,
      "step": 110
    },
    {
      "epoch": 0.0676056338028169,
      "grad_norm": 0.9579158425331116,
      "learning_rate": 2.38e-06,
      "loss": 0.0117,
      "step": 120
    },
    {
      "epoch": 0.07323943661971831,
      "grad_norm": 0.5810931324958801,
      "learning_rate": 2.5800000000000003e-06,
      "loss": 0.0101,
      "step": 130
    },
    {
      "epoch": 0.07887323943661972,
      "grad_norm": 0.24839498102664948,
      "learning_rate": 2.7800000000000005e-06,
      "loss": 0.0064,
      "step": 140
    },
    {
      "epoch": 0.08450704225352113,
      "grad_norm": 1.1974351406097412,
      "learning_rate": 2.9800000000000003e-06,
      "loss": 0.0054,
      "step": 150
    },
    {
      "epoch": 0.09014084507042254,
      "grad_norm": 2.2109923362731934,
      "learning_rate": 3.1800000000000005e-06,
      "loss": 0.0075,
      "step": 160
    },
    {
      "epoch": 0.09577464788732394,
      "grad_norm": 0.7371222972869873,
      "learning_rate": 3.3800000000000007e-06,
      "loss": 0.0107,
      "step": 170
    },
    {
      "epoch": 0.10140845070422536,
      "grad_norm": 1.1431416273117065,
      "learning_rate": 3.58e-06,
      "loss": 0.0072,
      "step": 180
    },
    {
      "epoch": 0.10704225352112676,
      "grad_norm": 1.5261584520339966,
      "learning_rate": 3.7800000000000002e-06,
      "loss": 0.0109,
      "step": 190
    },
    {
      "epoch": 0.11267605633802817,
      "grad_norm": 0.6876276731491089,
      "learning_rate": 3.980000000000001e-06,
      "loss": 0.0083,
      "step": 200
    },
    {
      "epoch": 0.11830985915492957,
      "grad_norm": 0.5882245898246765,
      "learning_rate": 4.18e-06,
      "loss": 0.0109,
      "step": 210
    },
    {
      "epoch": 0.12394366197183099,
      "grad_norm": 1.0145407915115356,
      "learning_rate": 4.38e-06,
      "loss": 0.0105,
      "step": 220
    },
    {
      "epoch": 0.1295774647887324,
      "grad_norm": 1.5309728384017944,
      "learning_rate": 4.58e-06,
      "loss": 0.0074,
      "step": 230
    },
    {
      "epoch": 0.1352112676056338,
      "grad_norm": 0.7701055407524109,
      "learning_rate": 4.78e-06,
      "loss": 0.009,
      "step": 240
    },
    {
      "epoch": 0.14084507042253522,
      "grad_norm": 1.1244176626205444,
      "learning_rate": 4.980000000000001e-06,
      "loss": 0.0106,
      "step": 250
    },
    {
      "epoch": 0.14647887323943662,
      "grad_norm": 1.3951512575149536,
      "learning_rate": 5.18e-06,
      "loss": 0.0083,
      "step": 260
    },
    {
      "epoch": 0.15211267605633802,
      "grad_norm": 0.7924229502677917,
      "learning_rate": 5.380000000000001e-06,
      "loss": 0.0063,
      "step": 270
    },
    {
      "epoch": 0.15774647887323945,
      "grad_norm": 0.5757330656051636,
      "learning_rate": 5.580000000000001e-06,
      "loss": 0.0093,
      "step": 280
    },
    {
      "epoch": 0.16338028169014085,
      "grad_norm": 1.189134120941162,
      "learning_rate": 5.78e-06,
      "loss": 0.0097,
      "step": 290
    },
    {
      "epoch": 0.16901408450704225,
      "grad_norm": 1.2784746885299683,
      "learning_rate": 5.98e-06,
      "loss": 0.0099,
      "step": 300
    },
    {
      "epoch": 0.17464788732394365,
      "grad_norm": 1.6380332708358765,
      "learning_rate": 6.18e-06,
      "loss": 0.0075,
      "step": 310
    },
    {
      "epoch": 0.18028169014084508,
      "grad_norm": 0.9639184474945068,
      "learning_rate": 6.380000000000001e-06,
      "loss": 0.0101,
      "step": 320
    },
    {
      "epoch": 0.18591549295774648,
      "grad_norm": 1.5722377300262451,
      "learning_rate": 6.5800000000000005e-06,
      "loss": 0.0105,
      "step": 330
    },
    {
      "epoch": 0.19154929577464788,
      "grad_norm": 3.752877712249756,
      "learning_rate": 6.780000000000001e-06,
      "loss": 0.0153,
      "step": 340
    },
    {
      "epoch": 0.19718309859154928,
      "grad_norm": 0.851811945438385,
      "learning_rate": 6.98e-06,
      "loss": 0.0175,
      "step": 350
    },
    {
      "epoch": 0.2028169014084507,
      "grad_norm": 1.006993055343628,
      "learning_rate": 7.180000000000001e-06,
      "loss": 0.009,
      "step": 360
    },
    {
      "epoch": 0.2084507042253521,
      "grad_norm": 1.064292550086975,
      "learning_rate": 7.3800000000000005e-06,
      "loss": 0.0098,
      "step": 370
    },
    {
      "epoch": 0.2140845070422535,
      "grad_norm": 1.17357337474823,
      "learning_rate": 7.58e-06,
      "loss": 0.0121,
      "step": 380
    },
    {
      "epoch": 0.21971830985915494,
      "grad_norm": 1.702314853668213,
      "learning_rate": 7.78e-06,
      "loss": 0.015,
      "step": 390
    },
    {
      "epoch": 0.22535211267605634,
      "grad_norm": 1.3108038902282715,
      "learning_rate": 7.980000000000002e-06,
      "loss": 0.0104,
      "step": 400
    },
    {
      "epoch": 0.23098591549295774,
      "grad_norm": 0.9514098167419434,
      "learning_rate": 8.18e-06,
      "loss": 0.009,
      "step": 410
    },
    {
      "epoch": 0.23661971830985915,
      "grad_norm": 1.800310492515564,
      "learning_rate": 8.380000000000001e-06,
      "loss": 0.0123,
      "step": 420
    },
    {
      "epoch": 0.24225352112676057,
      "grad_norm": 2.535449743270874,
      "learning_rate": 8.580000000000001e-06,
      "loss": 0.0137,
      "step": 430
    },
    {
      "epoch": 0.24788732394366197,
      "grad_norm": 1.0771689414978027,
      "learning_rate": 8.78e-06,
      "loss": 0.0161,
      "step": 440
    },
    {
      "epoch": 0.2535211267605634,
      "grad_norm": 0.4419674873352051,
      "learning_rate": 8.98e-06,
      "loss": 0.0125,
      "step": 450
    },
    {
      "epoch": 0.2591549295774648,
      "grad_norm": 1.9389402866363525,
      "learning_rate": 9.180000000000002e-06,
      "loss": 0.0115,
      "step": 460
    },
    {
      "epoch": 0.2647887323943662,
      "grad_norm": 5.996421813964844,
      "learning_rate": 9.38e-06,
      "loss": 0.0174,
      "step": 470
    },
    {
      "epoch": 0.2704225352112676,
      "grad_norm": 1.4965449571609497,
      "learning_rate": 9.58e-06,
      "loss": 0.0197,
      "step": 480
    },
    {
      "epoch": 0.27605633802816903,
      "grad_norm": 1.4441680908203125,
      "learning_rate": 9.780000000000001e-06,
      "loss": 0.0162,
      "step": 490
    },
    {
      "epoch": 0.28169014084507044,
      "grad_norm": 1.0549083948135376,
      "learning_rate": 9.980000000000001e-06,
      "loss": 0.0152,
      "step": 500
    },
    {
      "epoch": 0.28732394366197184,
      "grad_norm": 1.933992624282837,
      "learning_rate": 9.970491803278689e-06,
      "loss": 0.0146,
      "step": 510
    },
    {
      "epoch": 0.29295774647887324,
      "grad_norm": 1.2404003143310547,
      "learning_rate": 9.937704918032788e-06,
      "loss": 0.0159,
      "step": 520
    },
    {
      "epoch": 0.29859154929577464,
      "grad_norm": 1.868192434310913,
      "learning_rate": 9.904918032786887e-06,
      "loss": 0.0187,
      "step": 530
    },
    {
      "epoch": 0.30422535211267604,
      "grad_norm": 1.5579264163970947,
      "learning_rate": 9.872131147540984e-06,
      "loss": 0.0125,
      "step": 540
    },
    {
      "epoch": 0.30985915492957744,
      "grad_norm": 1.4725924730300903,
      "learning_rate": 9.839344262295083e-06,
      "loss": 0.0173,
      "step": 550
    },
    {
      "epoch": 0.3154929577464789,
      "grad_norm": 2.605309247970581,
      "learning_rate": 9.80655737704918e-06,
      "loss": 0.013,
      "step": 560
    },
    {
      "epoch": 0.3211267605633803,
      "grad_norm": 1.3504048585891724,
      "learning_rate": 9.77377049180328e-06,
      "loss": 0.0161,
      "step": 570
    },
    {
      "epoch": 0.3267605633802817,
      "grad_norm": 1.5028926134109497,
      "learning_rate": 9.740983606557379e-06,
      "loss": 0.0199,
      "step": 580
    },
    {
      "epoch": 0.3323943661971831,
      "grad_norm": 0.5192097425460815,
      "learning_rate": 9.708196721311476e-06,
      "loss": 0.0148,
      "step": 590
    },
    {
      "epoch": 0.3380281690140845,
      "grad_norm": 1.3670573234558105,
      "learning_rate": 9.675409836065575e-06,
      "loss": 0.0257,
      "step": 600
    },
    {
      "epoch": 0.3436619718309859,
      "grad_norm": 1.779189109802246,
      "learning_rate": 9.642622950819674e-06,
      "loss": 0.0163,
      "step": 610
    },
    {
      "epoch": 0.3492957746478873,
      "grad_norm": 1.8127739429473877,
      "learning_rate": 9.609836065573771e-06,
      "loss": 0.013,
      "step": 620
    },
    {
      "epoch": 0.35492957746478876,
      "grad_norm": 1.9041593074798584,
      "learning_rate": 9.57704918032787e-06,
      "loss": 0.0187,
      "step": 630
    },
    {
      "epoch": 0.36056338028169016,
      "grad_norm": 1.1961556673049927,
      "learning_rate": 9.544262295081968e-06,
      "loss": 0.0193,
      "step": 640
    },
    {
      "epoch": 0.36619718309859156,
      "grad_norm": 2.3215367794036865,
      "learning_rate": 9.511475409836067e-06,
      "loss": 0.0179,
      "step": 650
    },
    {
      "epoch": 0.37183098591549296,
      "grad_norm": 1.2701491117477417,
      "learning_rate": 9.478688524590164e-06,
      "loss": 0.0139,
      "step": 660
    },
    {
      "epoch": 0.37746478873239436,
      "grad_norm": 2.1906158924102783,
      "learning_rate": 9.445901639344263e-06,
      "loss": 0.0174,
      "step": 670
    },
    {
      "epoch": 0.38309859154929576,
      "grad_norm": 1.7713816165924072,
      "learning_rate": 9.413114754098362e-06,
      "loss": 0.0231,
      "step": 680
    },
    {
      "epoch": 0.38873239436619716,
      "grad_norm": 4.938525199890137,
      "learning_rate": 9.38032786885246e-06,
      "loss": 0.0165,
      "step": 690
    },
    {
      "epoch": 0.39436619718309857,
      "grad_norm": 1.2974543571472168,
      "learning_rate": 9.347540983606559e-06,
      "loss": 0.0173,
      "step": 700
    },
    {
      "epoch": 0.4,
      "grad_norm": 2.3422200679779053,
      "learning_rate": 9.314754098360656e-06,
      "loss": 0.0177,
      "step": 710
    },
    {
      "epoch": 0.4056338028169014,
      "grad_norm": 1.7533538341522217,
      "learning_rate": 9.281967213114755e-06,
      "loss": 0.0133,
      "step": 720
    },
    {
      "epoch": 0.4112676056338028,
      "grad_norm": 1.83546781539917,
      "learning_rate": 9.249180327868852e-06,
      "loss": 0.0198,
      "step": 730
    },
    {
      "epoch": 0.4169014084507042,
      "grad_norm": 3.8797242641448975,
      "learning_rate": 9.216393442622951e-06,
      "loss": 0.0137,
      "step": 740
    },
    {
      "epoch": 0.4225352112676056,
      "grad_norm": 0.6942452788352966,
      "learning_rate": 9.183606557377049e-06,
      "loss": 0.0131,
      "step": 750
    },
    {
      "epoch": 0.428169014084507,
      "grad_norm": 2.0836191177368164,
      "learning_rate": 9.150819672131148e-06,
      "loss": 0.022,
      "step": 760
    },
    {
      "epoch": 0.43380281690140843,
      "grad_norm": 1.7993505001068115,
      "learning_rate": 9.118032786885247e-06,
      "loss": 0.0177,
      "step": 770
    },
    {
      "epoch": 0.4394366197183099,
      "grad_norm": 1.7622896432876587,
      "learning_rate": 9.085245901639344e-06,
      "loss": 0.0191,
      "step": 780
    },
    {
      "epoch": 0.4450704225352113,
      "grad_norm": 1.080950140953064,
      "learning_rate": 9.052459016393443e-06,
      "loss": 0.0136,
      "step": 790
    },
    {
      "epoch": 0.4507042253521127,
      "grad_norm": 2.1891305446624756,
      "learning_rate": 9.019672131147542e-06,
      "loss": 0.0151,
      "step": 800
    },
    {
      "epoch": 0.4563380281690141,
      "grad_norm": 2.1063690185546875,
      "learning_rate": 8.98688524590164e-06,
      "loss": 0.0175,
      "step": 810
    },
    {
      "epoch": 0.4619718309859155,
      "grad_norm": 1.6590553522109985,
      "learning_rate": 8.954098360655739e-06,
      "loss": 0.0176,
      "step": 820
    },
    {
      "epoch": 0.4676056338028169,
      "grad_norm": 1.7408928871154785,
      "learning_rate": 8.921311475409838e-06,
      "loss": 0.0141,
      "step": 830
    },
    {
      "epoch": 0.4732394366197183,
      "grad_norm": 3.2451632022857666,
      "learning_rate": 8.888524590163935e-06,
      "loss": 0.0187,
      "step": 840
    },
    {
      "epoch": 0.4788732394366197,
      "grad_norm": 2.8567867279052734,
      "learning_rate": 8.855737704918034e-06,
      "loss": 0.0285,
      "step": 850
    },
    {
      "epoch": 0.48450704225352115,
      "grad_norm": 1.448118805885315,
      "learning_rate": 8.822950819672131e-06,
      "loss": 0.0118,
      "step": 860
    },
    {
      "epoch": 0.49014084507042255,
      "grad_norm": 1.4737434387207031,
      "learning_rate": 8.79016393442623e-06,
      "loss": 0.0149,
      "step": 870
    },
    {
      "epoch": 0.49577464788732395,
      "grad_norm": 3.3012146949768066,
      "learning_rate": 8.75737704918033e-06,
      "loss": 0.0182,
      "step": 880
    },
    {
      "epoch": 0.5014084507042254,
      "grad_norm": 1.027005910873413,
      "learning_rate": 8.724590163934427e-06,
      "loss": 0.0148,
      "step": 890
    },
    {
      "epoch": 0.5070422535211268,
      "grad_norm": 2.394969940185547,
      "learning_rate": 8.691803278688526e-06,
      "loss": 0.0182,
      "step": 900
    },
    {
      "epoch": 0.5126760563380282,
      "grad_norm": 1.4397178888320923,
      "learning_rate": 8.659016393442625e-06,
      "loss": 0.0206,
      "step": 910
    },
    {
      "epoch": 0.5183098591549296,
      "grad_norm": 2.706894636154175,
      "learning_rate": 8.626229508196722e-06,
      "loss": 0.0209,
      "step": 920
    },
    {
      "epoch": 0.523943661971831,
      "grad_norm": 3.6924495697021484,
      "learning_rate": 8.593442622950821e-06,
      "loss": 0.0231,
      "step": 930
    },
    {
      "epoch": 0.5295774647887324,
      "grad_norm": 2.320352554321289,
      "learning_rate": 8.560655737704918e-06,
      "loss": 0.0229,
      "step": 940
    },
    {
      "epoch": 0.5352112676056338,
      "grad_norm": 2.570531129837036,
      "learning_rate": 8.527868852459018e-06,
      "loss": 0.0157,
      "step": 950
    },
    {
      "epoch": 0.5408450704225352,
      "grad_norm": 2.1590864658355713,
      "learning_rate": 8.495081967213117e-06,
      "loss": 0.0162,
      "step": 960
    },
    {
      "epoch": 0.5464788732394367,
      "grad_norm": 1.717673659324646,
      "learning_rate": 8.462295081967214e-06,
      "loss": 0.019,
      "step": 970
    },
    {
      "epoch": 0.5521126760563381,
      "grad_norm": 1.1120901107788086,
      "learning_rate": 8.429508196721313e-06,
      "loss": 0.012,
      "step": 980
    },
    {
      "epoch": 0.5577464788732395,
      "grad_norm": 2.0220296382904053,
      "learning_rate": 8.39672131147541e-06,
      "loss": 0.0195,
      "step": 990
    },
    {
      "epoch": 0.5633802816901409,
      "grad_norm": 3.2409744262695312,
      "learning_rate": 8.36393442622951e-06,
      "loss": 0.0196,
      "step": 1000
    },
    {
      "epoch": 0.5690140845070423,
      "grad_norm": 1.8710452318191528,
      "learning_rate": 8.331147540983607e-06,
      "loss": 0.0216,
      "step": 1010
    },
    {
      "epoch": 0.5746478873239437,
      "grad_norm": 1.902570366859436,
      "learning_rate": 8.298360655737706e-06,
      "loss": 0.0121,
      "step": 1020
    },
    {
      "epoch": 0.5802816901408451,
      "grad_norm": 2.0836708545684814,
      "learning_rate": 8.265573770491803e-06,
      "loss": 0.0207,
      "step": 1030
    },
    {
      "epoch": 0.5859154929577465,
      "grad_norm": 0.8831673264503479,
      "learning_rate": 8.232786885245902e-06,
      "loss": 0.0189,
      "step": 1040
    },
    {
      "epoch": 0.5915492957746479,
      "grad_norm": 1.7548481225967407,
      "learning_rate": 8.2e-06,
      "loss": 0.0197,
      "step": 1050
    },
    {
      "epoch": 0.5971830985915493,
      "grad_norm": 1.3168854713439941,
      "learning_rate": 8.167213114754098e-06,
      "loss": 0.0149,
      "step": 1060
    },
    {
      "epoch": 0.6028169014084507,
      "grad_norm": 1.703010082244873,
      "learning_rate": 8.134426229508197e-06,
      "loss": 0.0184,
      "step": 1070
    },
    {
      "epoch": 0.6084507042253521,
      "grad_norm": 1.3345087766647339,
      "learning_rate": 8.101639344262295e-06,
      "loss": 0.0168,
      "step": 1080
    },
    {
      "epoch": 0.6140845070422535,
      "grad_norm": 1.4866513013839722,
      "learning_rate": 8.068852459016394e-06,
      "loss": 0.0159,
      "step": 1090
    },
    {
      "epoch": 0.6197183098591549,
      "grad_norm": 0.8655949234962463,
      "learning_rate": 8.036065573770493e-06,
      "loss": 0.0227,
      "step": 1100
    },
    {
      "epoch": 0.6253521126760564,
      "grad_norm": 1.8502163887023926,
      "learning_rate": 8.00327868852459e-06,
      "loss": 0.0213,
      "step": 1110
    },
    {
      "epoch": 0.6309859154929578,
      "grad_norm": 2.3221960067749023,
      "learning_rate": 7.97049180327869e-06,
      "loss": 0.0168,
      "step": 1120
    },
    {
      "epoch": 0.6366197183098592,
      "grad_norm": 0.6024265289306641,
      "learning_rate": 7.937704918032788e-06,
      "loss": 0.0119,
      "step": 1130
    },
    {
      "epoch": 0.6422535211267606,
      "grad_norm": 1.0888503789901733,
      "learning_rate": 7.904918032786886e-06,
      "loss": 0.018,
      "step": 1140
    },
    {
      "epoch": 0.647887323943662,
      "grad_norm": 1.887101173400879,
      "learning_rate": 7.872131147540985e-06,
      "loss": 0.0174,
      "step": 1150
    },
    {
      "epoch": 0.6535211267605634,
      "grad_norm": 1.6141228675842285,
      "learning_rate": 7.839344262295082e-06,
      "loss": 0.019,
      "step": 1160
    },
    {
      "epoch": 0.6591549295774648,
      "grad_norm": 0.9001067876815796,
      "learning_rate": 7.806557377049181e-06,
      "loss": 0.0145,
      "step": 1170
    },
    {
      "epoch": 0.6647887323943662,
      "grad_norm": 0.6219407320022583,
      "learning_rate": 7.77377049180328e-06,
      "loss": 0.0167,
      "step": 1180
    },
    {
      "epoch": 0.6704225352112676,
      "grad_norm": 1.1784660816192627,
      "learning_rate": 7.740983606557377e-06,
      "loss": 0.0152,
      "step": 1190
    },
    {
      "epoch": 0.676056338028169,
      "grad_norm": 1.0849504470825195,
      "learning_rate": 7.708196721311476e-06,
      "loss": 0.0181,
      "step": 1200
    },
    {
      "epoch": 0.6816901408450704,
      "grad_norm": 1.2452244758605957,
      "learning_rate": 7.675409836065576e-06,
      "loss": 0.0166,
      "step": 1210
    },
    {
      "epoch": 0.6873239436619718,
      "grad_norm": 1.5875732898712158,
      "learning_rate": 7.642622950819673e-06,
      "loss": 0.0161,
      "step": 1220
    },
    {
      "epoch": 0.6929577464788732,
      "grad_norm": 1.530847430229187,
      "learning_rate": 7.609836065573771e-06,
      "loss": 0.0201,
      "step": 1230
    },
    {
      "epoch": 0.6985915492957746,
      "grad_norm": 2.098750114440918,
      "learning_rate": 7.577049180327869e-06,
      "loss": 0.0161,
      "step": 1240
    },
    {
      "epoch": 0.704225352112676,
      "grad_norm": 0.9777464270591736,
      "learning_rate": 7.5442622950819674e-06,
      "loss": 0.0149,
      "step": 1250
    },
    {
      "epoch": 0.7098591549295775,
      "grad_norm": 1.5550135374069214,
      "learning_rate": 7.5114754098360665e-06,
      "loss": 0.0159,
      "step": 1260
    },
    {
      "epoch": 0.7154929577464789,
      "grad_norm": 1.6158477067947388,
      "learning_rate": 7.478688524590164e-06,
      "loss": 0.0143,
      "step": 1270
    },
    {
      "epoch": 0.7211267605633803,
      "grad_norm": 1.2116320133209229,
      "learning_rate": 7.445901639344263e-06,
      "loss": 0.014,
      "step": 1280
    },
    {
      "epoch": 0.7267605633802817,
      "grad_norm": 2.0365331172943115,
      "learning_rate": 7.413114754098362e-06,
      "loss": 0.0231,
      "step": 1290
    },
    {
      "epoch": 0.7323943661971831,
      "grad_norm": 2.1054487228393555,
      "learning_rate": 7.380327868852459e-06,
      "loss": 0.0256,
      "step": 1300
    },
    {
      "epoch": 0.7380281690140845,
      "grad_norm": 1.5217443704605103,
      "learning_rate": 7.347540983606558e-06,
      "loss": 0.017,
      "step": 1310
    },
    {
      "epoch": 0.7436619718309859,
      "grad_norm": 2.1168699264526367,
      "learning_rate": 7.314754098360657e-06,
      "loss": 0.0132,
      "step": 1320
    },
    {
      "epoch": 0.7492957746478873,
      "grad_norm": 2.137946844100952,
      "learning_rate": 7.281967213114755e-06,
      "loss": 0.0169,
      "step": 1330
    },
    {
      "epoch": 0.7549295774647887,
      "grad_norm": 3.1900794506073,
      "learning_rate": 7.249180327868854e-06,
      "loss": 0.0155,
      "step": 1340
    },
    {
      "epoch": 0.7605633802816901,
      "grad_norm": 1.0735390186309814,
      "learning_rate": 7.216393442622951e-06,
      "loss": 0.0213,
      "step": 1350
    },
    {
      "epoch": 0.7661971830985915,
      "grad_norm": 2.0629560947418213,
      "learning_rate": 7.18360655737705e-06,
      "loss": 0.0184,
      "step": 1360
    },
    {
      "epoch": 0.7718309859154929,
      "grad_norm": 1.6908258199691772,
      "learning_rate": 7.150819672131148e-06,
      "loss": 0.0206,
      "step": 1370
    },
    {
      "epoch": 0.7774647887323943,
      "grad_norm": 1.9119372367858887,
      "learning_rate": 7.118032786885246e-06,
      "loss": 0.0156,
      "step": 1380
    },
    {
      "epoch": 0.7830985915492957,
      "grad_norm": 2.6538517475128174,
      "learning_rate": 7.085245901639345e-06,
      "loss": 0.0218,
      "step": 1390
    },
    {
      "epoch": 0.7887323943661971,
      "grad_norm": 1.286156177520752,
      "learning_rate": 7.052459016393444e-06,
      "loss": 0.0138,
      "step": 1400
    },
    {
      "epoch": 0.7943661971830986,
      "grad_norm": 2.077173948287964,
      "learning_rate": 7.019672131147541e-06,
      "loss": 0.0214,
      "step": 1410
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.7982816696166992,
      "learning_rate": 6.98688524590164e-06,
      "loss": 0.0188,
      "step": 1420
    },
    {
      "epoch": 0.8056338028169014,
      "grad_norm": 1.7048382759094238,
      "learning_rate": 6.954098360655737e-06,
      "loss": 0.0131,
      "step": 1430
    },
    {
      "epoch": 0.8112676056338028,
      "grad_norm": 1.4576507806777954,
      "learning_rate": 6.921311475409836e-06,
      "loss": 0.0147,
      "step": 1440
    },
    {
      "epoch": 0.8169014084507042,
      "grad_norm": 1.2167617082595825,
      "learning_rate": 6.8885245901639354e-06,
      "loss": 0.0158,
      "step": 1450
    },
    {
      "epoch": 0.8225352112676056,
      "grad_norm": 2.0346646308898926,
      "learning_rate": 6.855737704918033e-06,
      "loss": 0.0158,
      "step": 1460
    },
    {
      "epoch": 0.828169014084507,
      "grad_norm": 1.2361705303192139,
      "learning_rate": 6.822950819672132e-06,
      "loss": 0.0101,
      "step": 1470
    },
    {
      "epoch": 0.8338028169014085,
      "grad_norm": 1.916527271270752,
      "learning_rate": 6.790163934426231e-06,
      "loss": 0.0147,
      "step": 1480
    },
    {
      "epoch": 0.8394366197183099,
      "grad_norm": 2.0439751148223877,
      "learning_rate": 6.757377049180328e-06,
      "loss": 0.0172,
      "step": 1490
    },
    {
      "epoch": 0.8450704225352113,
      "grad_norm": 1.4808157682418823,
      "learning_rate": 6.724590163934427e-06,
      "loss": 0.0144,
      "step": 1500
    },
    {
      "epoch": 0.8507042253521127,
      "grad_norm": 2.236619234085083,
      "learning_rate": 6.691803278688525e-06,
      "loss": 0.0124,
      "step": 1510
    },
    {
      "epoch": 0.856338028169014,
      "grad_norm": 1.0215883255004883,
      "learning_rate": 6.659016393442624e-06,
      "loss": 0.019,
      "step": 1520
    },
    {
      "epoch": 0.8619718309859155,
      "grad_norm": 2.6691057682037354,
      "learning_rate": 6.626229508196722e-06,
      "loss": 0.0251,
      "step": 1530
    },
    {
      "epoch": 0.8676056338028169,
      "grad_norm": 1.466689944267273,
      "learning_rate": 6.59344262295082e-06,
      "loss": 0.0142,
      "step": 1540
    },
    {
      "epoch": 0.8732394366197183,
      "grad_norm": 1.797670841217041,
      "learning_rate": 6.560655737704918e-06,
      "loss": 0.0164,
      "step": 1550
    },
    {
      "epoch": 0.8788732394366198,
      "grad_norm": 2.59198260307312,
      "learning_rate": 6.527868852459017e-06,
      "loss": 0.0252,
      "step": 1560
    },
    {
      "epoch": 0.8845070422535212,
      "grad_norm": 2.8690240383148193,
      "learning_rate": 6.4950819672131145e-06,
      "loss": 0.0185,
      "step": 1570
    },
    {
      "epoch": 0.8901408450704226,
      "grad_norm": 1.5174709558486938,
      "learning_rate": 6.4622950819672136e-06,
      "loss": 0.0197,
      "step": 1580
    },
    {
      "epoch": 0.895774647887324,
      "grad_norm": 0.8220402598381042,
      "learning_rate": 6.429508196721313e-06,
      "loss": 0.0186,
      "step": 1590
    },
    {
      "epoch": 0.9014084507042254,
      "grad_norm": 1.6295188665390015,
      "learning_rate": 6.39672131147541e-06,
      "loss": 0.0183,
      "step": 1600
    },
    {
      "epoch": 0.9070422535211268,
      "grad_norm": 1.0216023921966553,
      "learning_rate": 6.363934426229509e-06,
      "loss": 0.0149,
      "step": 1610
    },
    {
      "epoch": 0.9126760563380282,
      "grad_norm": 1.4068635702133179,
      "learning_rate": 6.331147540983608e-06,
      "loss": 0.0113,
      "step": 1620
    },
    {
      "epoch": 0.9183098591549296,
      "grad_norm": 1.1198160648345947,
      "learning_rate": 6.298360655737705e-06,
      "loss": 0.0123,
      "step": 1630
    },
    {
      "epoch": 0.923943661971831,
      "grad_norm": 1.9338263273239136,
      "learning_rate": 6.265573770491804e-06,
      "loss": 0.0133,
      "step": 1640
    },
    {
      "epoch": 0.9295774647887324,
      "grad_norm": 1.9467333555221558,
      "learning_rate": 6.232786885245902e-06,
      "loss": 0.0168,
      "step": 1650
    },
    {
      "epoch": 0.9352112676056338,
      "grad_norm": 5.4310712814331055,
      "learning_rate": 6.200000000000001e-06,
      "loss": 0.0183,
      "step": 1660
    },
    {
      "epoch": 0.9408450704225352,
      "grad_norm": 7.56189489364624,
      "learning_rate": 6.167213114754099e-06,
      "loss": 0.0142,
      "step": 1670
    },
    {
      "epoch": 0.9464788732394366,
      "grad_norm": 1.2114770412445068,
      "learning_rate": 6.134426229508197e-06,
      "loss": 0.0153,
      "step": 1680
    },
    {
      "epoch": 0.952112676056338,
      "grad_norm": 0.7653253078460693,
      "learning_rate": 6.101639344262295e-06,
      "loss": 0.0214,
      "step": 1690
    },
    {
      "epoch": 0.9577464788732394,
      "grad_norm": 1.0881645679473877,
      "learning_rate": 6.068852459016394e-06,
      "loss": 0.0165,
      "step": 1700
    },
    {
      "epoch": 0.9633802816901409,
      "grad_norm": 1.2689759731292725,
      "learning_rate": 6.036065573770492e-06,
      "loss": 0.0129,
      "step": 1710
    },
    {
      "epoch": 0.9690140845070423,
      "grad_norm": 1.2382926940917969,
      "learning_rate": 6.003278688524591e-06,
      "loss": 0.013,
      "step": 1720
    },
    {
      "epoch": 0.9746478873239437,
      "grad_norm": 1.3045654296875,
      "learning_rate": 5.970491803278688e-06,
      "loss": 0.0149,
      "step": 1730
    },
    {
      "epoch": 0.9802816901408451,
      "grad_norm": 1.588781476020813,
      "learning_rate": 5.937704918032787e-06,
      "loss": 0.0151,
      "step": 1740
    },
    {
      "epoch": 0.9859154929577465,
      "grad_norm": 0.9168156981468201,
      "learning_rate": 5.904918032786886e-06,
      "loss": 0.0163,
      "step": 1750
    },
    {
      "epoch": 0.9915492957746479,
      "grad_norm": 1.5272772312164307,
      "learning_rate": 5.8721311475409835e-06,
      "loss": 0.0127,
      "step": 1760
    },
    {
      "epoch": 0.9971830985915493,
      "grad_norm": 2.0196149349212646,
      "learning_rate": 5.8393442622950825e-06,
      "loss": 0.0167,
      "step": 1770
    },
    {
      "epoch": 1.0028169014084507,
      "grad_norm": 1.132421612739563,
      "learning_rate": 5.806557377049182e-06,
      "loss": 0.0111,
      "step": 1780
    },
    {
      "epoch": 1.008450704225352,
      "grad_norm": 0.7580140233039856,
      "learning_rate": 5.773770491803279e-06,
      "loss": 0.0042,
      "step": 1790
    },
    {
      "epoch": 1.0140845070422535,
      "grad_norm": 0.7441082000732422,
      "learning_rate": 5.740983606557378e-06,
      "loss": 0.0055,
      "step": 1800
    },
    {
      "epoch": 1.019718309859155,
      "grad_norm": 0.689598798751831,
      "learning_rate": 5.708196721311476e-06,
      "loss": 0.0067,
      "step": 1810
    },
    {
      "epoch": 1.0253521126760563,
      "grad_norm": 1.3248918056488037,
      "learning_rate": 5.675409836065574e-06,
      "loss": 0.0071,
      "step": 1820
    },
    {
      "epoch": 1.0309859154929577,
      "grad_norm": 0.6853448152542114,
      "learning_rate": 5.6426229508196725e-06,
      "loss": 0.0043,
      "step": 1830
    },
    {
      "epoch": 1.036619718309859,
      "grad_norm": 0.718917191028595,
      "learning_rate": 5.609836065573771e-06,
      "loss": 0.0047,
      "step": 1840
    },
    {
      "epoch": 1.0422535211267605,
      "grad_norm": 0.6834334135055542,
      "learning_rate": 5.577049180327869e-06,
      "loss": 0.0047,
      "step": 1850
    },
    {
      "epoch": 1.047887323943662,
      "grad_norm": 0.4496719539165497,
      "learning_rate": 5.544262295081968e-06,
      "loss": 0.0045,
      "step": 1860
    },
    {
      "epoch": 1.0535211267605633,
      "grad_norm": 0.11652589589357376,
      "learning_rate": 5.511475409836065e-06,
      "loss": 0.0081,
      "step": 1870
    },
    {
      "epoch": 1.0591549295774647,
      "grad_norm": 0.28196972608566284,
      "learning_rate": 5.478688524590164e-06,
      "loss": 0.0064,
      "step": 1880
    },
    {
      "epoch": 1.064788732394366,
      "grad_norm": 0.2863517701625824,
      "learning_rate": 5.445901639344263e-06,
      "loss": 0.0066,
      "step": 1890
    },
    {
      "epoch": 1.0704225352112675,
      "grad_norm": 0.3156941831111908,
      "learning_rate": 5.413114754098361e-06,
      "loss": 0.004,
      "step": 1900
    },
    {
      "epoch": 1.076056338028169,
      "grad_norm": 1.2497857809066772,
      "learning_rate": 5.38032786885246e-06,
      "loss": 0.0082,
      "step": 1910
    },
    {
      "epoch": 1.0816901408450703,
      "grad_norm": 0.25184062123298645,
      "learning_rate": 5.347540983606557e-06,
      "loss": 0.0061,
      "step": 1920
    },
    {
      "epoch": 1.0873239436619717,
      "grad_norm": 0.2447606921195984,
      "learning_rate": 5.314754098360656e-06,
      "loss": 0.0035,
      "step": 1930
    },
    {
      "epoch": 1.0929577464788733,
      "grad_norm": 1.0463167428970337,
      "learning_rate": 5.281967213114755e-06,
      "loss": 0.0069,
      "step": 1940
    },
    {
      "epoch": 1.0985915492957747,
      "grad_norm": 0.7045953273773193,
      "learning_rate": 5.2491803278688525e-06,
      "loss": 0.0091,
      "step": 1950
    },
    {
      "epoch": 1.1042253521126761,
      "grad_norm": 2.306065559387207,
      "learning_rate": 5.2163934426229515e-06,
      "loss": 0.0063,
      "step": 1960
    },
    {
      "epoch": 1.1098591549295775,
      "grad_norm": 0.23815812170505524,
      "learning_rate": 5.18360655737705e-06,
      "loss": 0.007,
      "step": 1970
    },
    {
      "epoch": 1.115492957746479,
      "grad_norm": 0.8824997544288635,
      "learning_rate": 5.150819672131148e-06,
      "loss": 0.0035,
      "step": 1980
    },
    {
      "epoch": 1.1211267605633803,
      "grad_norm": 1.441972017288208,
      "learning_rate": 5.118032786885246e-06,
      "loss": 0.0046,
      "step": 1990
    },
    {
      "epoch": 1.1267605633802817,
      "grad_norm": 0.5692065954208374,
      "learning_rate": 5.085245901639345e-06,
      "loss": 0.0034,
      "step": 2000
    },
    {
      "epoch": 1.1267605633802817,
      "eval_loss": 0.006601665634661913,
      "eval_runtime": 10204.8454,
      "eval_samples_per_second": 1.391,
      "eval_steps_per_second": 0.174,
      "eval_wer": 0.6022624256040361,
      "step": 2000
    }
  ],
  "logging_steps": 10,
  "max_steps": 3550,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 2000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 6.531667287146496e+19,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}