{ "best_global_step": 810, "best_metric": 0.5652363896369934, "best_model_checkpoint": "./nepal-legal-model/checkpoint-810", "epoch": 1.0, "eval_steps": 500, "global_step": 810, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012345679012345678, "grad_norm": 2.151370048522949, "learning_rate": 3.673469387755102e-05, "loss": 1.7959, "step": 10 }, { "epoch": 0.024691358024691357, "grad_norm": 1.6559878587722778, "learning_rate": 7.755102040816327e-05, "loss": 1.1927, "step": 20 }, { "epoch": 0.037037037037037035, "grad_norm": 1.125174641609192, "learning_rate": 0.00011836734693877552, "loss": 0.9138, "step": 30 }, { "epoch": 0.04938271604938271, "grad_norm": 1.193498969078064, "learning_rate": 0.00015918367346938776, "loss": 0.8576, "step": 40 }, { "epoch": 0.06172839506172839, "grad_norm": 1.0680336952209473, "learning_rate": 0.0002, "loss": 0.7993, "step": 50 }, { "epoch": 0.07407407407407407, "grad_norm": 0.9959688782691956, "learning_rate": 0.00019998000585179915, "loss": 0.7726, "step": 60 }, { "epoch": 0.08641975308641975, "grad_norm": 0.9554275274276733, "learning_rate": 0.00019992003140251584, "loss": 0.7653, "step": 70 }, { "epoch": 0.09876543209876543, "grad_norm": 0.8977523446083069, "learning_rate": 0.00019982010063491056, "loss": 0.7866, "step": 80 }, { "epoch": 0.1111111111111111, "grad_norm": 0.8715363144874573, "learning_rate": 0.00019968025350959495, "loss": 0.743, "step": 90 }, { "epoch": 0.12345679012345678, "grad_norm": 0.9484530091285706, "learning_rate": 0.00019950054594905194, "loss": 0.7463, "step": 100 }, { "epoch": 0.13580246913580246, "grad_norm": 0.899090588092804, "learning_rate": 0.00019928104981527348, "loss": 0.7249, "step": 110 }, { "epoch": 0.14814814814814814, "grad_norm": 0.8685264587402344, "learning_rate": 0.0001990218528810242, "loss": 0.7151, "step": 120 }, { "epoch": 0.16049382716049382, "grad_norm": 0.8665175437927246, "learning_rate": 0.00019872305879474234, "loss": 0.6964, "step": 130 }, { "epoch": 0.1728395061728395, "grad_norm": 0.8796736598014832, "learning_rate": 0.00019838478703909282, "loss": 0.7099, "step": 140 }, { "epoch": 0.18518518518518517, "grad_norm": 0.8723923563957214, "learning_rate": 0.0001980071728831879, "loss": 0.7037, "step": 150 }, { "epoch": 0.19753086419753085, "grad_norm": 0.8163822889328003, "learning_rate": 0.0001975903673284955, "loss": 0.6695, "step": 160 }, { "epoch": 0.20987654320987653, "grad_norm": 0.8660425543785095, "learning_rate": 0.0001971345370484563, "loss": 0.6806, "step": 170 }, { "epoch": 0.2222222222222222, "grad_norm": 0.854213535785675, "learning_rate": 0.00019663986432183372, "loss": 0.7105, "step": 180 }, { "epoch": 0.2345679012345679, "grad_norm": 0.8273674249649048, "learning_rate": 0.0001961065469598239, "loss": 0.6901, "step": 190 }, { "epoch": 0.24691358024691357, "grad_norm": 0.8302693963050842, "learning_rate": 0.00019553479822695434, "loss": 0.6962, "step": 200 }, { "epoch": 0.25925925925925924, "grad_norm": 0.9145434498786926, "learning_rate": 0.000194924846755803, "loss": 0.6889, "step": 210 }, { "epoch": 0.2716049382716049, "grad_norm": 0.842943012714386, "learning_rate": 0.0001942769364555721, "loss": 0.6856, "step": 220 }, { "epoch": 0.2839506172839506, "grad_norm": 0.8815113306045532, "learning_rate": 0.0001935913264145529, "loss": 0.6889, "step": 230 }, { "epoch": 0.2962962962962963, "grad_norm": 0.8023223280906677, "learning_rate": 0.0001928682907965207, "loss": 0.6955, "step": 240 }, { "epoch": 0.30864197530864196, "grad_norm": 0.7992237210273743, "learning_rate": 0.0001921081187311016, "loss": 0.6562, "step": 250 }, { "epoch": 0.32098765432098764, "grad_norm": 0.8328211903572083, "learning_rate": 0.0001913111141981543, "loss": 0.6513, "step": 260 }, { "epoch": 0.3333333333333333, "grad_norm": 0.8767305612564087, "learning_rate": 0.00019047759590621374, "loss": 0.6897, "step": 270 }, { "epoch": 0.345679012345679, "grad_norm": 0.8765648007392883, "learning_rate": 0.000189607897165045, "loss": 0.6516, "step": 280 }, { "epoch": 0.35802469135802467, "grad_norm": 1.12288498878479, "learning_rate": 0.0001887023657523586, "loss": 0.6391, "step": 290 }, { "epoch": 0.37037037037037035, "grad_norm": 0.8884546756744385, "learning_rate": 0.00018776136377473982, "loss": 0.6613, "step": 300 }, { "epoch": 0.38271604938271603, "grad_norm": 0.8964736461639404, "learning_rate": 0.00018678526752284857, "loss": 0.6629, "step": 310 }, { "epoch": 0.3950617283950617, "grad_norm": 0.8375449180603027, "learning_rate": 0.0001857744673209473, "loss": 0.6215, "step": 320 }, { "epoch": 0.4074074074074074, "grad_norm": 0.9215915203094482, "learning_rate": 0.00018472936737081672, "loss": 0.6443, "step": 330 }, { "epoch": 0.41975308641975306, "grad_norm": 0.795101523399353, "learning_rate": 0.00018365038559012265, "loss": 0.6548, "step": 340 }, { "epoch": 0.43209876543209874, "grad_norm": 0.8599629402160645, "learning_rate": 0.00018253795344529757, "loss": 0.6382, "step": 350 }, { "epoch": 0.4444444444444444, "grad_norm": 0.9184603691101074, "learning_rate": 0.0001813925157790049, "loss": 0.6254, "step": 360 }, { "epoch": 0.4567901234567901, "grad_norm": 0.8265050649642944, "learning_rate": 0.0001802145306322537, "loss": 0.6172, "step": 370 }, { "epoch": 0.4691358024691358, "grad_norm": 0.9218090772628784, "learning_rate": 0.00017900446906123603, "loss": 0.6319, "step": 380 }, { "epoch": 0.48148148148148145, "grad_norm": 0.8071914315223694, "learning_rate": 0.00017776281494895956, "loss": 0.6195, "step": 390 }, { "epoch": 0.49382716049382713, "grad_norm": 0.8605950474739075, "learning_rate": 0.00017649006481175098, "loss": 0.6706, "step": 400 }, { "epoch": 0.5061728395061729, "grad_norm": 0.9902074337005615, "learning_rate": 0.00017518672760070763, "loss": 0.643, "step": 410 }, { "epoch": 0.5185185185185185, "grad_norm": 0.9705262184143066, "learning_rate": 0.00017385332449817656, "loss": 0.6433, "step": 420 }, { "epoch": 0.5308641975308642, "grad_norm": 0.8431525230407715, "learning_rate": 0.00017249038870934262, "loss": 0.6375, "step": 430 }, { "epoch": 0.5432098765432098, "grad_norm": 0.8588557243347168, "learning_rate": 0.00017109846524900887, "loss": 0.6143, "step": 440 }, { "epoch": 0.5555555555555556, "grad_norm": 0.8512520790100098, "learning_rate": 0.00016967811072365421, "loss": 0.6175, "step": 450 }, { "epoch": 0.5679012345679012, "grad_norm": 0.8612751960754395, "learning_rate": 0.0001682298931088563, "loss": 0.6428, "step": 460 }, { "epoch": 0.5802469135802469, "grad_norm": 0.8494360446929932, "learning_rate": 0.00016675439152216747, "loss": 0.6103, "step": 470 }, { "epoch": 0.5925925925925926, "grad_norm": 0.8205723166465759, "learning_rate": 0.0001652521959915356, "loss": 0.5897, "step": 480 }, { "epoch": 0.6049382716049383, "grad_norm": 0.900737464427948, "learning_rate": 0.00016372390721936198, "loss": 0.6038, "step": 490 }, { "epoch": 0.6172839506172839, "grad_norm": 0.7930091619491577, "learning_rate": 0.00016217013634229073, "loss": 0.6256, "step": 500 }, { "epoch": 0.6296296296296297, "grad_norm": 0.8809173703193665, "learning_rate": 0.00016059150468682558, "loss": 0.6111, "step": 510 }, { "epoch": 0.6419753086419753, "grad_norm": 0.8604996800422668, "learning_rate": 0.000158988643520872, "loss": 0.6153, "step": 520 }, { "epoch": 0.654320987654321, "grad_norm": 0.8070431351661682, "learning_rate": 0.00015736219380130395, "loss": 0.6154, "step": 530 }, { "epoch": 0.6666666666666666, "grad_norm": 1.0792070627212524, "learning_rate": 0.0001557128059176561, "loss": 0.6152, "step": 540 }, { "epoch": 0.6790123456790124, "grad_norm": 0.8521204590797424, "learning_rate": 0.00015404113943204422, "loss": 0.5986, "step": 550 }, { "epoch": 0.691358024691358, "grad_norm": 0.7541177272796631, "learning_rate": 0.00015234786281541736, "loss": 0.6048, "step": 560 }, { "epoch": 0.7037037037037037, "grad_norm": 0.850120484828949, "learning_rate": 0.0001506336531802479, "loss": 0.5929, "step": 570 }, { "epoch": 0.7160493827160493, "grad_norm": 0.8251618146896362, "learning_rate": 0.0001488991960097657, "loss": 0.6171, "step": 580 }, { "epoch": 0.7283950617283951, "grad_norm": 1.0346400737762451, "learning_rate": 0.00014714518488384513, "loss": 0.6045, "step": 590 }, { "epoch": 0.7407407407407407, "grad_norm": 0.829596221446991, "learning_rate": 0.000145372321201654, "loss": 0.5872, "step": 600 }, { "epoch": 0.7530864197530864, "grad_norm": 0.9026337265968323, "learning_rate": 0.00014358131390117645, "loss": 0.5964, "step": 610 }, { "epoch": 0.7654320987654321, "grad_norm": 0.8854096531867981, "learning_rate": 0.00014177287917572031, "loss": 0.579, "step": 620 }, { "epoch": 0.7777777777777778, "grad_norm": 0.7996705770492554, "learning_rate": 0.00013994774018752387, "loss": 0.5915, "step": 630 }, { "epoch": 0.7901234567901234, "grad_norm": 0.8649770021438599, "learning_rate": 0.00013810662677857547, "loss": 0.6079, "step": 640 }, { "epoch": 0.8024691358024691, "grad_norm": 0.8632396459579468, "learning_rate": 0.00013625027517876216, "loss": 0.5921, "step": 650 }, { "epoch": 0.8148148148148148, "grad_norm": 0.8673484325408936, "learning_rate": 0.00013437942771146388, "loss": 0.5807, "step": 660 }, { "epoch": 0.8271604938271605, "grad_norm": 0.8567239046096802, "learning_rate": 0.00013249483249671117, "loss": 0.6008, "step": 670 }, { "epoch": 0.8395061728395061, "grad_norm": 0.8294961452484131, "learning_rate": 0.00013059724315202443, "loss": 0.5972, "step": 680 }, { "epoch": 0.8518518518518519, "grad_norm": 0.9110690951347351, "learning_rate": 0.0001286874184910553, "loss": 0.5823, "step": 690 }, { "epoch": 0.8641975308641975, "grad_norm": 0.8697801828384399, "learning_rate": 0.00012676612222015, "loss": 0.5792, "step": 700 }, { "epoch": 0.8765432098765432, "grad_norm": 0.9493820071220398, "learning_rate": 0.00012483412263295603, "loss": 0.545, "step": 710 }, { "epoch": 0.8888888888888888, "grad_norm": 0.8390475511550903, "learning_rate": 0.0001228921923031948, "loss": 0.5844, "step": 720 }, { "epoch": 0.9012345679012346, "grad_norm": 0.8430746793746948, "learning_rate": 0.00012094110777572256, "loss": 0.6021, "step": 730 }, { "epoch": 0.9135802469135802, "grad_norm": 0.7902705073356628, "learning_rate": 0.00011898164925600315, "loss": 0.585, "step": 740 }, { "epoch": 0.9259259259259259, "grad_norm": 0.87884521484375, "learning_rate": 0.00011701460029811733, "loss": 0.5998, "step": 750 }, { "epoch": 0.9382716049382716, "grad_norm": 0.924493134021759, "learning_rate": 0.00011504074749143269, "loss": 0.5945, "step": 760 }, { "epoch": 0.9506172839506173, "grad_norm": 0.8170025944709778, "learning_rate": 0.00011306088014606018, "loss": 0.5814, "step": 770 }, { "epoch": 0.9629629629629629, "grad_norm": 0.9573339819908142, "learning_rate": 0.00011107578997722219, "loss": 0.577, "step": 780 }, { "epoch": 0.9753086419753086, "grad_norm": 0.8860905766487122, "learning_rate": 0.00010908627078865927, "loss": 0.5972, "step": 790 }, { "epoch": 0.9876543209876543, "grad_norm": 0.8376012444496155, "learning_rate": 0.00010709311815520151, "loss": 0.5554, "step": 800 }, { "epoch": 1.0, "grad_norm": 0.8038386106491089, "learning_rate": 0.00010509712910463174, "loss": 0.5622, "step": 810 }, { "epoch": 1.0, "eval_loss": 0.5652363896369934, "eval_runtime": 496.7418, "eval_samples_per_second": 2.899, "eval_steps_per_second": 0.725, "step": 810 } ], "logging_steps": 10, "max_steps": 1620, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.4237273708455526e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }