{
  "best_global_step": 810,
  "best_metric": 0.7329609990119934,
  "best_model_checkpoint": "./nepal-legal-model/checkpoint-810",
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 810,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.012345679012345678,
      "grad_norm": 2.1483232975006104,
      "learning_rate": 2.4657534246575342e-05,
      "loss": 1.8579,
      "step": 10
    },
    {
      "epoch": 0.024691358024691357,
      "grad_norm": 2.067028045654297,
      "learning_rate": 5.2054794520547945e-05,
      "loss": 1.4274,
      "step": 20
    },
    {
      "epoch": 0.037037037037037035,
      "grad_norm": 1.7251120805740356,
      "learning_rate": 7.945205479452055e-05,
      "loss": 1.0776,
      "step": 30
    },
    {
      "epoch": 0.04938271604938271,
      "grad_norm": 1.4287426471710205,
      "learning_rate": 0.00010684931506849317,
      "loss": 1.0135,
      "step": 40
    },
    {
      "epoch": 0.06172839506172839,
      "grad_norm": 1.3038461208343506,
      "learning_rate": 0.00013424657534246576,
      "loss": 0.9539,
      "step": 50
    },
    {
      "epoch": 0.07407407407407407,
      "grad_norm": 1.2418831586837769,
      "learning_rate": 0.00016164383561643837,
      "loss": 0.9208,
      "step": 60
    },
    {
      "epoch": 0.08641975308641975,
      "grad_norm": 1.1602648496627808,
      "learning_rate": 0.00018904109589041096,
      "loss": 0.9139,
      "step": 70
    },
    {
      "epoch": 0.09876543209876543,
      "grad_norm": 1.1099259853363037,
      "learning_rate": 0.0001999968022038833,
      "loss": 0.9438,
      "step": 80
    },
    {
      "epoch": 0.1111111111111111,
      "grad_norm": 1.0900254249572754,
      "learning_rate": 0.0001999772608571399,
      "loss": 0.8994,
      "step": 90
    },
    {
      "epoch": 0.12345679012345678,
      "grad_norm": 1.1199110746383667,
      "learning_rate": 0.0001999399581844347,
      "loss": 0.8972,
      "step": 100
    },
    {
      "epoch": 0.13580246913580246,
      "grad_norm": 1.1685237884521484,
      "learning_rate": 0.00019988490081272397,
      "loss": 0.8753,
      "step": 110
    },
    {
      "epoch": 0.14814814814814814,
      "grad_norm": 1.0480120182037354,
      "learning_rate": 0.0001998120985231511,
      "loss": 0.8805,
      "step": 120
    },
    {
      "epoch": 0.16049382716049382,
      "grad_norm": 1.0806885957717896,
      "learning_rate": 0.00019972156424930896,
      "loss": 0.8553,
      "step": 130
    },
    {
      "epoch": 0.1728395061728395,
      "grad_norm": 1.1102285385131836,
      "learning_rate": 0.00019961331407494245,
      "loss": 0.864,
      "step": 140
    },
    {
      "epoch": 0.18518518518518517,
      "grad_norm": 1.0785351991653442,
      "learning_rate": 0.00019948736723109082,
      "loss": 0.8686,
      "step": 150
    },
    {
      "epoch": 0.19753086419753085,
      "grad_norm": 1.0402244329452515,
      "learning_rate": 0.00019934374609267136,
      "loss": 0.827,
      "step": 160
    },
    {
      "epoch": 0.20987654320987653,
      "grad_norm": 1.1072781085968018,
      "learning_rate": 0.00019918247617450454,
      "loss": 0.8345,
      "step": 170
    },
    {
      "epoch": 0.2222222222222222,
      "grad_norm": 1.0837725400924683,
      "learning_rate": 0.00019900358612678099,
      "loss": 0.8745,
      "step": 180
    },
    {
      "epoch": 0.2345679012345679,
      "grad_norm": 1.0628209114074707,
      "learning_rate": 0.0001988071077299718,
      "loss": 0.8584,
      "step": 190
    },
    {
      "epoch": 0.24691358024691357,
      "grad_norm": 1.0645418167114258,
      "learning_rate": 0.00019859307588918258,
      "loss": 0.8579,
      "step": 200
    },
    {
      "epoch": 0.25925925925925924,
      "grad_norm": 1.140362024307251,
      "learning_rate": 0.00019836152862795245,
      "loss": 0.8534,
      "step": 210
    },
    {
      "epoch": 0.2716049382716049,
      "grad_norm": 1.072213053703308,
      "learning_rate": 0.0001981125070814991,
      "loss": 0.8391,
      "step": 220
    },
    {
      "epoch": 0.2839506172839506,
      "grad_norm": 1.0480926036834717,
      "learning_rate": 0.00019784605548941073,
      "loss": 0.8562,
      "step": 230
    },
    {
      "epoch": 0.2962962962962963,
      "grad_norm": 1.0673149824142456,
      "learning_rate": 0.00019756222118778706,
      "loss": 0.8616,
      "step": 240
    },
    {
      "epoch": 0.30864197530864196,
      "grad_norm": 1.061468482017517,
      "learning_rate": 0.0001972610546008295,
      "loss": 0.8103,
      "step": 250
    },
    {
      "epoch": 0.32098765432098764,
      "grad_norm": 1.1202861070632935,
      "learning_rate": 0.00019694260923188356,
      "loss": 0.8037,
      "step": 260
    },
    {
      "epoch": 0.3333333333333333,
      "grad_norm": 1.0481396913528442,
      "learning_rate": 0.00019660694165393334,
      "loss": 0.8497,
      "step": 270
    },
    {
      "epoch": 0.345679012345679,
      "grad_norm": 1.0048563480377197,
      "learning_rate": 0.00019625411149955154,
      "loss": 0.8188,
      "step": 280
    },
    {
      "epoch": 0.35802469135802467,
      "grad_norm": 1.1558786630630493,
      "learning_rate": 0.00019588418145030518,
      "loss": 0.7984,
      "step": 290
    },
    {
      "epoch": 0.37037037037037035,
      "grad_norm": 1.1208077669143677,
      "learning_rate": 0.00019549721722562018,
      "loss": 0.8265,
      "step": 300
    },
    {
      "epoch": 0.38271604938271603,
      "grad_norm": 1.1445388793945312,
      "learning_rate": 0.00019509328757110598,
      "loss": 0.822,
      "step": 310
    },
    {
      "epoch": 0.3950617283950617,
      "grad_norm": 0.9763692617416382,
      "learning_rate": 0.0001946724642463427,
      "loss": 0.7717,
      "step": 320
    },
    {
      "epoch": 0.4074074074074074,
      "grad_norm": 1.1003857851028442,
      "learning_rate": 0.00019423482201213276,
      "loss": 0.792,
      "step": 330
    },
    {
      "epoch": 0.41975308641975306,
      "grad_norm": 0.9452071189880371,
      "learning_rate": 0.0001937804386172193,
      "loss": 0.812,
      "step": 340
    },
    {
      "epoch": 0.43209876543209874,
      "grad_norm": 1.1163841485977173,
      "learning_rate": 0.00019330939478447393,
      "loss": 0.7962,
      "step": 350
    },
    {
      "epoch": 0.4444444444444444,
      "grad_norm": 1.0678188800811768,
      "learning_rate": 0.00019282177419655585,
      "loss": 0.7824,
      "step": 360
    },
    {
      "epoch": 0.4567901234567901,
      "grad_norm": 0.9730695486068726,
      "learning_rate": 0.00019231766348104556,
      "loss": 0.7681,
      "step": 370
    },
    {
      "epoch": 0.4691358024691358,
      "grad_norm": 1.1495038270950317,
      "learning_rate": 0.000191797152195055,
      "loss": 0.7846,
      "step": 380
    },
    {
      "epoch": 0.48148148148148145,
      "grad_norm": 1.0246994495391846,
      "learning_rate": 0.00019126033280931733,
      "loss": 0.7905,
      "step": 390
    },
    {
      "epoch": 0.49382716049382713,
      "grad_norm": 1.1239315271377563,
      "learning_rate": 0.00019070730069175936,
      "loss": 0.8343,
      "step": 400
    },
    {
      "epoch": 0.5061728395061729,
      "grad_norm": 1.1794980764389038,
      "learning_rate": 0.00019013815409055896,
      "loss": 0.7987,
      "step": 410
    },
    {
      "epoch": 0.5185185185185185,
      "grad_norm": 1.16999351978302,
      "learning_rate": 0.0001895529941166909,
      "loss": 0.7939,
      "step": 420
    },
    {
      "epoch": 0.5308641975308642,
      "grad_norm": 1.077894926071167,
      "learning_rate": 0.00018895192472596426,
      "loss": 0.7979,
      "step": 430
    },
    {
      "epoch": 0.5432098765432098,
      "grad_norm": 1.169101595878601,
      "learning_rate": 0.0001883350527005541,
      "loss": 0.7817,
      "step": 440
    },
    {
      "epoch": 0.5555555555555556,
      "grad_norm": 1.211340069770813,
      "learning_rate": 0.00018770248763003134,
      "loss": 0.78,
      "step": 450
    },
    {
      "epoch": 0.5679012345679012,
      "grad_norm": 1.0432859659194946,
      "learning_rate": 0.00018705434189189376,
      "loss": 0.8006,
      "step": 460
    },
    {
      "epoch": 0.5802469135802469,
      "grad_norm": 1.0830694437026978,
      "learning_rate": 0.00018639073063160172,
      "loss": 0.7811,
      "step": 470
    },
    {
      "epoch": 0.5925925925925926,
      "grad_norm": 1.0481195449829102,
      "learning_rate": 0.00018571177174212214,
      "loss": 0.7438,
      "step": 480
    },
    {
      "epoch": 0.6049382716049383,
      "grad_norm": 1.0968844890594482,
      "learning_rate": 0.00018501758584298433,
      "loss": 0.7656,
      "step": 490
    },
    {
      "epoch": 0.6172839506172839,
      "grad_norm": 1.000929832458496,
      "learning_rate": 0.00018430829625885165,
      "loss": 0.789,
      "step": 500
    },
    {
      "epoch": 0.6296296296296297,
      "grad_norm": 1.0056663751602173,
      "learning_rate": 0.00018358402899761218,
      "loss": 0.7727,
      "step": 510
    },
    {
      "epoch": 0.6419753086419753,
      "grad_norm": 1.0271273851394653,
      "learning_rate": 0.00018284491272799327,
      "loss": 0.7723,
      "step": 520
    },
    {
      "epoch": 0.654320987654321,
      "grad_norm": 1.0579137802124023,
      "learning_rate": 0.00018209107875670277,
      "loss": 0.7818,
      "step": 530
    },
    {
      "epoch": 0.6666666666666666,
      "grad_norm": 1.2076494693756104,
      "learning_rate": 0.00018132266100510214,
      "loss": 0.7709,
      "step": 540
    },
    {
      "epoch": 0.6790123456790124,
      "grad_norm": 1.0516587495803833,
      "learning_rate": 0.0001805397959854147,
      "loss": 0.7594,
      "step": 550
    },
    {
      "epoch": 0.691358024691358,
      "grad_norm": 1.0052849054336548,
      "learning_rate": 0.00017974262277647374,
      "loss": 0.7645,
      "step": 560
    },
    {
      "epoch": 0.7037037037037037,
      "grad_norm": 1.067668080329895,
      "learning_rate": 0.00017893128299901472,
      "loss": 0.7508,
      "step": 570
    },
    {
      "epoch": 0.7160493827160493,
      "grad_norm": 1.1277140378952026,
      "learning_rate": 0.00017810592079051585,
      "loss": 0.7878,
      "step": 580
    },
    {
      "epoch": 0.7283950617283951,
      "grad_norm": 1.1608866453170776,
      "learning_rate": 0.00017726668277959136,
      "loss": 0.7638,
      "step": 590
    },
    {
      "epoch": 0.7407407407407407,
      "grad_norm": 1.102867603302002,
      "learning_rate": 0.00017641371805994264,
      "loss": 0.7618,
      "step": 600
    },
    {
      "epoch": 0.7530864197530864,
      "grad_norm": 1.1261731386184692,
      "learning_rate": 0.00017554717816387107,
      "loss": 0.758,
      "step": 610
    },
    {
      "epoch": 0.7654320987654321,
      "grad_norm": 1.066041111946106,
      "learning_rate": 0.00017466721703535764,
      "loss": 0.7478,
      "step": 620
    },
    {
      "epoch": 0.7777777777777778,
      "grad_norm": 1.021802544593811,
      "learning_rate": 0.0001737739910027145,
      "loss": 0.7554,
      "step": 630
    },
    {
      "epoch": 0.7901234567901234,
      "grad_norm": 1.1736857891082764,
      "learning_rate": 0.00017286765875081244,
      "loss": 0.78,
      "step": 640
    },
    {
      "epoch": 0.8024691358024691,
      "grad_norm": 1.1246825456619263,
      "learning_rate": 0.00017194838129289006,
      "loss": 0.7475,
      "step": 650
    },
    {
      "epoch": 0.8148148148148148,
      "grad_norm": 1.1927721500396729,
      "learning_rate": 0.0001710163219419491,
      "loss": 0.742,
      "step": 660
    },
    {
      "epoch": 0.8271604938271605,
      "grad_norm": 1.0868706703186035,
      "learning_rate": 0.00017007164628174139,
      "loss": 0.7676,
      "step": 670
    },
    {
      "epoch": 0.8395061728395061,
      "grad_norm": 1.0195647478103638,
      "learning_rate": 0.00016911452213735223,
      "loss": 0.7543,
      "step": 680
    },
    {
      "epoch": 0.8518518518518519,
      "grad_norm": 1.1533241271972656,
      "learning_rate": 0.00016814511954538558,
      "loss": 0.7463,
      "step": 690
    },
    {
      "epoch": 0.8641975308641975,
      "grad_norm": 1.1366993188858032,
      "learning_rate": 0.00016716361072375657,
      "loss": 0.7407,
      "step": 700
    },
    {
      "epoch": 0.8765432098765432,
      "grad_norm": 1.2098641395568848,
      "learning_rate": 0.00016617017004109632,
      "loss": 0.7031,
      "step": 710
    },
    {
      "epoch": 0.8888888888888888,
      "grad_norm": 0.9996205568313599,
      "learning_rate": 0.0001651649739857746,
      "loss": 0.7405,
      "step": 720
    },
    {
      "epoch": 0.9012345679012346,
      "grad_norm": 1.046208143234253,
      "learning_rate": 0.00016414820113454622,
      "loss": 0.763,
      "step": 730
    },
    {
      "epoch": 0.9135802469135802,
      "grad_norm": 1.0813542604446411,
      "learning_rate": 0.0001631200321208261,
      "loss": 0.7432,
      "step": 740
    },
    {
      "epoch": 0.9259259259259259,
      "grad_norm": 1.1473093032836914,
      "learning_rate": 0.00016208064960259897,
      "loss": 0.7738,
      "step": 750
    },
    {
      "epoch": 0.9382716049382716,
      "grad_norm": 1.084929347038269,
      "learning_rate": 0.00016103023822996982,
      "loss": 0.764,
      "step": 760
    },
    {
      "epoch": 0.9506172839506173,
      "grad_norm": 1.0776119232177734,
      "learning_rate": 0.00015996898461235977,
      "loss": 0.7496,
      "step": 770
    },
    {
      "epoch": 0.9629629629629629,
      "grad_norm": 1.1536927223205566,
      "learning_rate": 0.00015889707728535462,
      "loss": 0.7501,
      "step": 780
    },
    {
      "epoch": 0.9753086419753086,
      "grad_norm": 1.211068868637085,
      "learning_rate": 0.0001578147066772104,
      "loss": 0.7623,
      "step": 790
    },
    {
      "epoch": 0.9876543209876543,
      "grad_norm": 1.1335115432739258,
      "learning_rate": 0.00015672206507502337,
      "loss": 0.7261,
      "step": 800
    },
    {
      "epoch": 1.0,
      "grad_norm": 1.1170907020568848,
      "learning_rate": 0.00015561934659056947,
      "loss": 0.732,
      "step": 810
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.7329609990119934,
      "eval_runtime": 487.7516,
      "eval_samples_per_second": 2.952,
      "eval_steps_per_second": 0.738,
      "step": 810
    }
  ],
  "logging_steps": 10,
  "max_steps": 2430,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.4237273708455526e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}