{ "best_global_step": 810, "best_metric": 0.7329609990119934, "best_model_checkpoint": "./nepal-legal-model/checkpoint-810", "epoch": 1.0, "eval_steps": 500, "global_step": 810, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012345679012345678, "grad_norm": 2.1483232975006104, "learning_rate": 2.4657534246575342e-05, "loss": 1.8579, "step": 10 }, { "epoch": 0.024691358024691357, "grad_norm": 2.067028045654297, "learning_rate": 5.2054794520547945e-05, "loss": 1.4274, "step": 20 }, { "epoch": 0.037037037037037035, "grad_norm": 1.7251120805740356, "learning_rate": 7.945205479452055e-05, "loss": 1.0776, "step": 30 }, { "epoch": 0.04938271604938271, "grad_norm": 1.4287426471710205, "learning_rate": 0.00010684931506849317, "loss": 1.0135, "step": 40 }, { "epoch": 0.06172839506172839, "grad_norm": 1.3038461208343506, "learning_rate": 0.00013424657534246576, "loss": 0.9539, "step": 50 }, { "epoch": 0.07407407407407407, "grad_norm": 1.2418831586837769, "learning_rate": 0.00016164383561643837, "loss": 0.9208, "step": 60 }, { "epoch": 0.08641975308641975, "grad_norm": 1.1602648496627808, "learning_rate": 0.00018904109589041096, "loss": 0.9139, "step": 70 }, { "epoch": 0.09876543209876543, "grad_norm": 1.1099259853363037, "learning_rate": 0.0001999968022038833, "loss": 0.9438, "step": 80 }, { "epoch": 0.1111111111111111, "grad_norm": 1.0900254249572754, "learning_rate": 0.0001999772608571399, "loss": 0.8994, "step": 90 }, { "epoch": 0.12345679012345678, "grad_norm": 1.1199110746383667, "learning_rate": 0.0001999399581844347, "loss": 0.8972, "step": 100 }, { "epoch": 0.13580246913580246, "grad_norm": 1.1685237884521484, "learning_rate": 0.00019988490081272397, "loss": 0.8753, "step": 110 }, { "epoch": 0.14814814814814814, "grad_norm": 1.0480120182037354, "learning_rate": 0.0001998120985231511, "loss": 0.8805, "step": 120 }, { "epoch": 0.16049382716049382, "grad_norm": 1.0806885957717896, "learning_rate": 0.00019972156424930896, "loss": 0.8553, "step": 130 }, { "epoch": 0.1728395061728395, "grad_norm": 1.1102285385131836, "learning_rate": 0.00019961331407494245, "loss": 0.864, "step": 140 }, { "epoch": 0.18518518518518517, "grad_norm": 1.0785351991653442, "learning_rate": 0.00019948736723109082, "loss": 0.8686, "step": 150 }, { "epoch": 0.19753086419753085, "grad_norm": 1.0402244329452515, "learning_rate": 0.00019934374609267136, "loss": 0.827, "step": 160 }, { "epoch": 0.20987654320987653, "grad_norm": 1.1072781085968018, "learning_rate": 0.00019918247617450454, "loss": 0.8345, "step": 170 }, { "epoch": 0.2222222222222222, "grad_norm": 1.0837725400924683, "learning_rate": 0.00019900358612678099, "loss": 0.8745, "step": 180 }, { "epoch": 0.2345679012345679, "grad_norm": 1.0628209114074707, "learning_rate": 0.0001988071077299718, "loss": 0.8584, "step": 190 }, { "epoch": 0.24691358024691357, "grad_norm": 1.0645418167114258, "learning_rate": 0.00019859307588918258, "loss": 0.8579, "step": 200 }, { "epoch": 0.25925925925925924, "grad_norm": 1.140362024307251, "learning_rate": 0.00019836152862795245, "loss": 0.8534, "step": 210 }, { "epoch": 0.2716049382716049, "grad_norm": 1.072213053703308, "learning_rate": 0.0001981125070814991, "loss": 0.8391, "step": 220 }, { "epoch": 0.2839506172839506, "grad_norm": 1.0480926036834717, "learning_rate": 0.00019784605548941073, "loss": 0.8562, "step": 230 }, { "epoch": 0.2962962962962963, "grad_norm": 1.0673149824142456, "learning_rate": 0.00019756222118778706, "loss": 0.8616, "step": 240 }, { "epoch": 0.30864197530864196, "grad_norm": 1.061468482017517, "learning_rate": 0.0001972610546008295, "loss": 0.8103, "step": 250 }, { "epoch": 0.32098765432098764, "grad_norm": 1.1202861070632935, "learning_rate": 0.00019694260923188356, "loss": 0.8037, "step": 260 }, { "epoch": 0.3333333333333333, "grad_norm": 1.0481396913528442, "learning_rate": 0.00019660694165393334, "loss": 0.8497, "step": 270 }, { "epoch": 0.345679012345679, "grad_norm": 1.0048563480377197, "learning_rate": 0.00019625411149955154, "loss": 0.8188, "step": 280 }, { "epoch": 0.35802469135802467, "grad_norm": 1.1558786630630493, "learning_rate": 0.00019588418145030518, "loss": 0.7984, "step": 290 }, { "epoch": 0.37037037037037035, "grad_norm": 1.1208077669143677, "learning_rate": 0.00019549721722562018, "loss": 0.8265, "step": 300 }, { "epoch": 0.38271604938271603, "grad_norm": 1.1445388793945312, "learning_rate": 0.00019509328757110598, "loss": 0.822, "step": 310 }, { "epoch": 0.3950617283950617, "grad_norm": 0.9763692617416382, "learning_rate": 0.0001946724642463427, "loss": 0.7717, "step": 320 }, { "epoch": 0.4074074074074074, "grad_norm": 1.1003857851028442, "learning_rate": 0.00019423482201213276, "loss": 0.792, "step": 330 }, { "epoch": 0.41975308641975306, "grad_norm": 0.9452071189880371, "learning_rate": 0.0001937804386172193, "loss": 0.812, "step": 340 }, { "epoch": 0.43209876543209874, "grad_norm": 1.1163841485977173, "learning_rate": 0.00019330939478447393, "loss": 0.7962, "step": 350 }, { "epoch": 0.4444444444444444, "grad_norm": 1.0678188800811768, "learning_rate": 0.00019282177419655585, "loss": 0.7824, "step": 360 }, { "epoch": 0.4567901234567901, "grad_norm": 0.9730695486068726, "learning_rate": 0.00019231766348104556, "loss": 0.7681, "step": 370 }, { "epoch": 0.4691358024691358, "grad_norm": 1.1495038270950317, "learning_rate": 0.000191797152195055, "loss": 0.7846, "step": 380 }, { "epoch": 0.48148148148148145, "grad_norm": 1.0246994495391846, "learning_rate": 0.00019126033280931733, "loss": 0.7905, "step": 390 }, { "epoch": 0.49382716049382713, "grad_norm": 1.1239315271377563, "learning_rate": 0.00019070730069175936, "loss": 0.8343, "step": 400 }, { "epoch": 0.5061728395061729, "grad_norm": 1.1794980764389038, "learning_rate": 0.00019013815409055896, "loss": 0.7987, "step": 410 }, { "epoch": 0.5185185185185185, "grad_norm": 1.16999351978302, "learning_rate": 0.0001895529941166909, "loss": 0.7939, "step": 420 }, { "epoch": 0.5308641975308642, "grad_norm": 1.077894926071167, "learning_rate": 0.00018895192472596426, "loss": 0.7979, "step": 430 }, { "epoch": 0.5432098765432098, "grad_norm": 1.169101595878601, "learning_rate": 0.0001883350527005541, "loss": 0.7817, "step": 440 }, { "epoch": 0.5555555555555556, "grad_norm": 1.211340069770813, "learning_rate": 0.00018770248763003134, "loss": 0.78, "step": 450 }, { "epoch": 0.5679012345679012, "grad_norm": 1.0432859659194946, "learning_rate": 0.00018705434189189376, "loss": 0.8006, "step": 460 }, { "epoch": 0.5802469135802469, "grad_norm": 1.0830694437026978, "learning_rate": 0.00018639073063160172, "loss": 0.7811, "step": 470 }, { "epoch": 0.5925925925925926, "grad_norm": 1.0481195449829102, "learning_rate": 0.00018571177174212214, "loss": 0.7438, "step": 480 }, { "epoch": 0.6049382716049383, "grad_norm": 1.0968844890594482, "learning_rate": 0.00018501758584298433, "loss": 0.7656, "step": 490 }, { "epoch": 0.6172839506172839, "grad_norm": 1.000929832458496, "learning_rate": 0.00018430829625885165, "loss": 0.789, "step": 500 }, { "epoch": 0.6296296296296297, "grad_norm": 1.0056663751602173, "learning_rate": 0.00018358402899761218, "loss": 0.7727, "step": 510 }, { "epoch": 0.6419753086419753, "grad_norm": 1.0271273851394653, "learning_rate": 0.00018284491272799327, "loss": 0.7723, "step": 520 }, { "epoch": 0.654320987654321, "grad_norm": 1.0579137802124023, "learning_rate": 0.00018209107875670277, "loss": 0.7818, "step": 530 }, { "epoch": 0.6666666666666666, "grad_norm": 1.2076494693756104, "learning_rate": 0.00018132266100510214, "loss": 0.7709, "step": 540 }, { "epoch": 0.6790123456790124, "grad_norm": 1.0516587495803833, "learning_rate": 0.0001805397959854147, "loss": 0.7594, "step": 550 }, { "epoch": 0.691358024691358, "grad_norm": 1.0052849054336548, "learning_rate": 0.00017974262277647374, "loss": 0.7645, "step": 560 }, { "epoch": 0.7037037037037037, "grad_norm": 1.067668080329895, "learning_rate": 0.00017893128299901472, "loss": 0.7508, "step": 570 }, { "epoch": 0.7160493827160493, "grad_norm": 1.1277140378952026, "learning_rate": 0.00017810592079051585, "loss": 0.7878, "step": 580 }, { "epoch": 0.7283950617283951, "grad_norm": 1.1608866453170776, "learning_rate": 0.00017726668277959136, "loss": 0.7638, "step": 590 }, { "epoch": 0.7407407407407407, "grad_norm": 1.102867603302002, "learning_rate": 0.00017641371805994264, "loss": 0.7618, "step": 600 }, { "epoch": 0.7530864197530864, "grad_norm": 1.1261731386184692, "learning_rate": 0.00017554717816387107, "loss": 0.758, "step": 610 }, { "epoch": 0.7654320987654321, "grad_norm": 1.066041111946106, "learning_rate": 0.00017466721703535764, "loss": 0.7478, "step": 620 }, { "epoch": 0.7777777777777778, "grad_norm": 1.021802544593811, "learning_rate": 0.0001737739910027145, "loss": 0.7554, "step": 630 }, { "epoch": 0.7901234567901234, "grad_norm": 1.1736857891082764, "learning_rate": 0.00017286765875081244, "loss": 0.78, "step": 640 }, { "epoch": 0.8024691358024691, "grad_norm": 1.1246825456619263, "learning_rate": 0.00017194838129289006, "loss": 0.7475, "step": 650 }, { "epoch": 0.8148148148148148, "grad_norm": 1.1927721500396729, "learning_rate": 0.0001710163219419491, "loss": 0.742, "step": 660 }, { "epoch": 0.8271604938271605, "grad_norm": 1.0868706703186035, "learning_rate": 0.00017007164628174139, "loss": 0.7676, "step": 670 }, { "epoch": 0.8395061728395061, "grad_norm": 1.0195647478103638, "learning_rate": 0.00016911452213735223, "loss": 0.7543, "step": 680 }, { "epoch": 0.8518518518518519, "grad_norm": 1.1533241271972656, "learning_rate": 0.00016814511954538558, "loss": 0.7463, "step": 690 }, { "epoch": 0.8641975308641975, "grad_norm": 1.1366993188858032, "learning_rate": 0.00016716361072375657, "loss": 0.7407, "step": 700 }, { "epoch": 0.8765432098765432, "grad_norm": 1.2098641395568848, "learning_rate": 0.00016617017004109632, "loss": 0.7031, "step": 710 }, { "epoch": 0.8888888888888888, "grad_norm": 0.9996205568313599, "learning_rate": 0.0001651649739857746, "loss": 0.7405, "step": 720 }, { "epoch": 0.9012345679012346, "grad_norm": 1.046208143234253, "learning_rate": 0.00016414820113454622, "loss": 0.763, "step": 730 }, { "epoch": 0.9135802469135802, "grad_norm": 1.0813542604446411, "learning_rate": 0.0001631200321208261, "loss": 0.7432, "step": 740 }, { "epoch": 0.9259259259259259, "grad_norm": 1.1473093032836914, "learning_rate": 0.00016208064960259897, "loss": 0.7738, "step": 750 }, { "epoch": 0.9382716049382716, "grad_norm": 1.084929347038269, "learning_rate": 0.00016103023822996982, "loss": 0.764, "step": 760 }, { "epoch": 0.9506172839506173, "grad_norm": 1.0776119232177734, "learning_rate": 0.00015996898461235977, "loss": 0.7496, "step": 770 }, { "epoch": 0.9629629629629629, "grad_norm": 1.1536927223205566, "learning_rate": 0.00015889707728535462, "loss": 0.7501, "step": 780 }, { "epoch": 0.9753086419753086, "grad_norm": 1.211068868637085, "learning_rate": 0.0001578147066772104, "loss": 0.7623, "step": 790 }, { "epoch": 0.9876543209876543, "grad_norm": 1.1335115432739258, "learning_rate": 0.00015672206507502337, "loss": 0.7261, "step": 800 }, { "epoch": 1.0, "grad_norm": 1.1170907020568848, "learning_rate": 0.00015561934659056947, "loss": 0.732, "step": 810 }, { "epoch": 1.0, "eval_loss": 0.7329609990119934, "eval_runtime": 487.7516, "eval_samples_per_second": 2.952, "eval_steps_per_second": 0.738, "step": 810 } ], "logging_steps": 10, "max_steps": 2430, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.4237273708455526e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }