{
  "best_global_step": 810,
  "best_metric": 0.5652363896369934,
  "best_model_checkpoint": "./nepal-legal-model/checkpoint-810",
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 810,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.012345679012345678,
      "grad_norm": 2.151370048522949,
      "learning_rate": 3.673469387755102e-05,
      "loss": 1.7959,
      "step": 10
    },
    {
      "epoch": 0.024691358024691357,
      "grad_norm": 1.6559878587722778,
      "learning_rate": 7.755102040816327e-05,
      "loss": 1.1927,
      "step": 20
    },
    {
      "epoch": 0.037037037037037035,
      "grad_norm": 1.125174641609192,
      "learning_rate": 0.00011836734693877552,
      "loss": 0.9138,
      "step": 30
    },
    {
      "epoch": 0.04938271604938271,
      "grad_norm": 1.193498969078064,
      "learning_rate": 0.00015918367346938776,
      "loss": 0.8576,
      "step": 40
    },
    {
      "epoch": 0.06172839506172839,
      "grad_norm": 1.0680336952209473,
      "learning_rate": 0.0002,
      "loss": 0.7993,
      "step": 50
    },
    {
      "epoch": 0.07407407407407407,
      "grad_norm": 0.9959688782691956,
      "learning_rate": 0.00019998000585179915,
      "loss": 0.7726,
      "step": 60
    },
    {
      "epoch": 0.08641975308641975,
      "grad_norm": 0.9554275274276733,
      "learning_rate": 0.00019992003140251584,
      "loss": 0.7653,
      "step": 70
    },
    {
      "epoch": 0.09876543209876543,
      "grad_norm": 0.8977523446083069,
      "learning_rate": 0.00019982010063491056,
      "loss": 0.7866,
      "step": 80
    },
    {
      "epoch": 0.1111111111111111,
      "grad_norm": 0.8715363144874573,
      "learning_rate": 0.00019968025350959495,
      "loss": 0.743,
      "step": 90
    },
    {
      "epoch": 0.12345679012345678,
      "grad_norm": 0.9484530091285706,
      "learning_rate": 0.00019950054594905194,
      "loss": 0.7463,
      "step": 100
    },
    {
      "epoch": 0.13580246913580246,
      "grad_norm": 0.899090588092804,
      "learning_rate": 0.00019928104981527348,
      "loss": 0.7249,
      "step": 110
    },
    {
      "epoch": 0.14814814814814814,
      "grad_norm": 0.8685264587402344,
      "learning_rate": 0.0001990218528810242,
      "loss": 0.7151,
      "step": 120
    },
    {
      "epoch": 0.16049382716049382,
      "grad_norm": 0.8665175437927246,
      "learning_rate": 0.00019872305879474234,
      "loss": 0.6964,
      "step": 130
    },
    {
      "epoch": 0.1728395061728395,
      "grad_norm": 0.8796736598014832,
      "learning_rate": 0.00019838478703909282,
      "loss": 0.7099,
      "step": 140
    },
    {
      "epoch": 0.18518518518518517,
      "grad_norm": 0.8723923563957214,
      "learning_rate": 0.0001980071728831879,
      "loss": 0.7037,
      "step": 150
    },
    {
      "epoch": 0.19753086419753085,
      "grad_norm": 0.8163822889328003,
      "learning_rate": 0.0001975903673284955,
      "loss": 0.6695,
      "step": 160
    },
    {
      "epoch": 0.20987654320987653,
      "grad_norm": 0.8660425543785095,
      "learning_rate": 0.0001971345370484563,
      "loss": 0.6806,
      "step": 170
    },
    {
      "epoch": 0.2222222222222222,
      "grad_norm": 0.854213535785675,
      "learning_rate": 0.00019663986432183372,
      "loss": 0.7105,
      "step": 180
    },
    {
      "epoch": 0.2345679012345679,
      "grad_norm": 0.8273674249649048,
      "learning_rate": 0.0001961065469598239,
      "loss": 0.6901,
      "step": 190
    },
    {
      "epoch": 0.24691358024691357,
      "grad_norm": 0.8302693963050842,
      "learning_rate": 0.00019553479822695434,
      "loss": 0.6962,
      "step": 200
    },
    {
      "epoch": 0.25925925925925924,
      "grad_norm": 0.9145434498786926,
      "learning_rate": 0.000194924846755803,
      "loss": 0.6889,
      "step": 210
    },
    {
      "epoch": 0.2716049382716049,
      "grad_norm": 0.842943012714386,
      "learning_rate": 0.0001942769364555721,
      "loss": 0.6856,
      "step": 220
    },
    {
      "epoch": 0.2839506172839506,
      "grad_norm": 0.8815113306045532,
      "learning_rate": 0.0001935913264145529,
      "loss": 0.6889,
      "step": 230
    },
    {
      "epoch": 0.2962962962962963,
      "grad_norm": 0.8023223280906677,
      "learning_rate": 0.0001928682907965207,
      "loss": 0.6955,
      "step": 240
    },
    {
      "epoch": 0.30864197530864196,
      "grad_norm": 0.7992237210273743,
      "learning_rate": 0.0001921081187311016,
      "loss": 0.6562,
      "step": 250
    },
    {
      "epoch": 0.32098765432098764,
      "grad_norm": 0.8328211903572083,
      "learning_rate": 0.0001913111141981543,
      "loss": 0.6513,
      "step": 260
    },
    {
      "epoch": 0.3333333333333333,
      "grad_norm": 0.8767305612564087,
      "learning_rate": 0.00019047759590621374,
      "loss": 0.6897,
      "step": 270
    },
    {
      "epoch": 0.345679012345679,
      "grad_norm": 0.8765648007392883,
      "learning_rate": 0.000189607897165045,
      "loss": 0.6516,
      "step": 280
    },
    {
      "epoch": 0.35802469135802467,
      "grad_norm": 1.12288498878479,
      "learning_rate": 0.0001887023657523586,
      "loss": 0.6391,
      "step": 290
    },
    {
      "epoch": 0.37037037037037035,
      "grad_norm": 0.8884546756744385,
      "learning_rate": 0.00018776136377473982,
      "loss": 0.6613,
      "step": 300
    },
    {
      "epoch": 0.38271604938271603,
      "grad_norm": 0.8964736461639404,
      "learning_rate": 0.00018678526752284857,
      "loss": 0.6629,
      "step": 310
    },
    {
      "epoch": 0.3950617283950617,
      "grad_norm": 0.8375449180603027,
      "learning_rate": 0.0001857744673209473,
      "loss": 0.6215,
      "step": 320
    },
    {
      "epoch": 0.4074074074074074,
      "grad_norm": 0.9215915203094482,
      "learning_rate": 0.00018472936737081672,
      "loss": 0.6443,
      "step": 330
    },
    {
      "epoch": 0.41975308641975306,
      "grad_norm": 0.795101523399353,
      "learning_rate": 0.00018365038559012265,
      "loss": 0.6548,
      "step": 340
    },
    {
      "epoch": 0.43209876543209874,
      "grad_norm": 0.8599629402160645,
      "learning_rate": 0.00018253795344529757,
      "loss": 0.6382,
      "step": 350
    },
    {
      "epoch": 0.4444444444444444,
      "grad_norm": 0.9184603691101074,
      "learning_rate": 0.0001813925157790049,
      "loss": 0.6254,
      "step": 360
    },
    {
      "epoch": 0.4567901234567901,
      "grad_norm": 0.8265050649642944,
      "learning_rate": 0.0001802145306322537,
      "loss": 0.6172,
      "step": 370
    },
    {
      "epoch": 0.4691358024691358,
      "grad_norm": 0.9218090772628784,
      "learning_rate": 0.00017900446906123603,
      "loss": 0.6319,
      "step": 380
    },
    {
      "epoch": 0.48148148148148145,
      "grad_norm": 0.8071914315223694,
      "learning_rate": 0.00017776281494895956,
      "loss": 0.6195,
      "step": 390
    },
    {
      "epoch": 0.49382716049382713,
      "grad_norm": 0.8605950474739075,
      "learning_rate": 0.00017649006481175098,
      "loss": 0.6706,
      "step": 400
    },
    {
      "epoch": 0.5061728395061729,
      "grad_norm": 0.9902074337005615,
      "learning_rate": 0.00017518672760070763,
      "loss": 0.643,
      "step": 410
    },
    {
      "epoch": 0.5185185185185185,
      "grad_norm": 0.9705262184143066,
      "learning_rate": 0.00017385332449817656,
      "loss": 0.6433,
      "step": 420
    },
    {
      "epoch": 0.5308641975308642,
      "grad_norm": 0.8431525230407715,
      "learning_rate": 0.00017249038870934262,
      "loss": 0.6375,
      "step": 430
    },
    {
      "epoch": 0.5432098765432098,
      "grad_norm": 0.8588557243347168,
      "learning_rate": 0.00017109846524900887,
      "loss": 0.6143,
      "step": 440
    },
    {
      "epoch": 0.5555555555555556,
      "grad_norm": 0.8512520790100098,
      "learning_rate": 0.00016967811072365421,
      "loss": 0.6175,
      "step": 450
    },
    {
      "epoch": 0.5679012345679012,
      "grad_norm": 0.8612751960754395,
      "learning_rate": 0.0001682298931088563,
      "loss": 0.6428,
      "step": 460
    },
    {
      "epoch": 0.5802469135802469,
      "grad_norm": 0.8494360446929932,
      "learning_rate": 0.00016675439152216747,
      "loss": 0.6103,
      "step": 470
    },
    {
      "epoch": 0.5925925925925926,
      "grad_norm": 0.8205723166465759,
      "learning_rate": 0.0001652521959915356,
      "loss": 0.5897,
      "step": 480
    },
    {
      "epoch": 0.6049382716049383,
      "grad_norm": 0.900737464427948,
      "learning_rate": 0.00016372390721936198,
      "loss": 0.6038,
      "step": 490
    },
    {
      "epoch": 0.6172839506172839,
      "grad_norm": 0.7930091619491577,
      "learning_rate": 0.00016217013634229073,
      "loss": 0.6256,
      "step": 500
    },
    {
      "epoch": 0.6296296296296297,
      "grad_norm": 0.8809173703193665,
      "learning_rate": 0.00016059150468682558,
      "loss": 0.6111,
      "step": 510
    },
    {
      "epoch": 0.6419753086419753,
      "grad_norm": 0.8604996800422668,
      "learning_rate": 0.000158988643520872,
      "loss": 0.6153,
      "step": 520
    },
    {
      "epoch": 0.654320987654321,
      "grad_norm": 0.8070431351661682,
      "learning_rate": 0.00015736219380130395,
      "loss": 0.6154,
      "step": 530
    },
    {
      "epoch": 0.6666666666666666,
      "grad_norm": 1.0792070627212524,
      "learning_rate": 0.0001557128059176561,
      "loss": 0.6152,
      "step": 540
    },
    {
      "epoch": 0.6790123456790124,
      "grad_norm": 0.8521204590797424,
      "learning_rate": 0.00015404113943204422,
      "loss": 0.5986,
      "step": 550
    },
    {
      "epoch": 0.691358024691358,
      "grad_norm": 0.7541177272796631,
      "learning_rate": 0.00015234786281541736,
      "loss": 0.6048,
      "step": 560
    },
    {
      "epoch": 0.7037037037037037,
      "grad_norm": 0.850120484828949,
      "learning_rate": 0.0001506336531802479,
      "loss": 0.5929,
      "step": 570
    },
    {
      "epoch": 0.7160493827160493,
      "grad_norm": 0.8251618146896362,
      "learning_rate": 0.0001488991960097657,
      "loss": 0.6171,
      "step": 580
    },
    {
      "epoch": 0.7283950617283951,
      "grad_norm": 1.0346400737762451,
      "learning_rate": 0.00014714518488384513,
      "loss": 0.6045,
      "step": 590
    },
    {
      "epoch": 0.7407407407407407,
      "grad_norm": 0.829596221446991,
      "learning_rate": 0.000145372321201654,
      "loss": 0.5872,
      "step": 600
    },
    {
      "epoch": 0.7530864197530864,
      "grad_norm": 0.9026337265968323,
      "learning_rate": 0.00014358131390117645,
      "loss": 0.5964,
      "step": 610
    },
    {
      "epoch": 0.7654320987654321,
      "grad_norm": 0.8854096531867981,
      "learning_rate": 0.00014177287917572031,
      "loss": 0.579,
      "step": 620
    },
    {
      "epoch": 0.7777777777777778,
      "grad_norm": 0.7996705770492554,
      "learning_rate": 0.00013994774018752387,
      "loss": 0.5915,
      "step": 630
    },
    {
      "epoch": 0.7901234567901234,
      "grad_norm": 0.8649770021438599,
      "learning_rate": 0.00013810662677857547,
      "loss": 0.6079,
      "step": 640
    },
    {
      "epoch": 0.8024691358024691,
      "grad_norm": 0.8632396459579468,
      "learning_rate": 0.00013625027517876216,
      "loss": 0.5921,
      "step": 650
    },
    {
      "epoch": 0.8148148148148148,
      "grad_norm": 0.8673484325408936,
      "learning_rate": 0.00013437942771146388,
      "loss": 0.5807,
      "step": 660
    },
    {
      "epoch": 0.8271604938271605,
      "grad_norm": 0.8567239046096802,
      "learning_rate": 0.00013249483249671117,
      "loss": 0.6008,
      "step": 670
    },
    {
      "epoch": 0.8395061728395061,
      "grad_norm": 0.8294961452484131,
      "learning_rate": 0.00013059724315202443,
      "loss": 0.5972,
      "step": 680
    },
    {
      "epoch": 0.8518518518518519,
      "grad_norm": 0.9110690951347351,
      "learning_rate": 0.0001286874184910553,
      "loss": 0.5823,
      "step": 690
    },
    {
      "epoch": 0.8641975308641975,
      "grad_norm": 0.8697801828384399,
      "learning_rate": 0.00012676612222015,
      "loss": 0.5792,
      "step": 700
    },
    {
      "epoch": 0.8765432098765432,
      "grad_norm": 0.9493820071220398,
      "learning_rate": 0.00012483412263295603,
      "loss": 0.545,
      "step": 710
    },
    {
      "epoch": 0.8888888888888888,
      "grad_norm": 0.8390475511550903,
      "learning_rate": 0.0001228921923031948,
      "loss": 0.5844,
      "step": 720
    },
    {
      "epoch": 0.9012345679012346,
      "grad_norm": 0.8430746793746948,
      "learning_rate": 0.00012094110777572256,
      "loss": 0.6021,
      "step": 730
    },
    {
      "epoch": 0.9135802469135802,
      "grad_norm": 0.7902705073356628,
      "learning_rate": 0.00011898164925600315,
      "loss": 0.585,
      "step": 740
    },
    {
      "epoch": 0.9259259259259259,
      "grad_norm": 0.87884521484375,
      "learning_rate": 0.00011701460029811733,
      "loss": 0.5998,
      "step": 750
    },
    {
      "epoch": 0.9382716049382716,
      "grad_norm": 0.924493134021759,
      "learning_rate": 0.00011504074749143269,
      "loss": 0.5945,
      "step": 760
    },
    {
      "epoch": 0.9506172839506173,
      "grad_norm": 0.8170025944709778,
      "learning_rate": 0.00011306088014606018,
      "loss": 0.5814,
      "step": 770
    },
    {
      "epoch": 0.9629629629629629,
      "grad_norm": 0.9573339819908142,
      "learning_rate": 0.00011107578997722219,
      "loss": 0.577,
      "step": 780
    },
    {
      "epoch": 0.9753086419753086,
      "grad_norm": 0.8860905766487122,
      "learning_rate": 0.00010908627078865927,
      "loss": 0.5972,
      "step": 790
    },
    {
      "epoch": 0.9876543209876543,
      "grad_norm": 0.8376012444496155,
      "learning_rate": 0.00010709311815520151,
      "loss": 0.5554,
      "step": 800
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.8038386106491089,
      "learning_rate": 0.00010509712910463174,
      "loss": 0.5622,
      "step": 810
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.5652363896369934,
      "eval_runtime": 496.7418,
      "eval_samples_per_second": 2.899,
      "eval_steps_per_second": 0.725,
      "step": 810
    }
  ],
  "logging_steps": 10,
  "max_steps": 1620,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.4237273708455526e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}