MrMoeeee
/

lamp-qwen-training

+{
+  "best_global_step": 550,
+  "best_metric": 0.027798546478152275,
+  "best_model_checkpoint": "/workspace/lampAI/finetuning/outputs/lamp-qwen-1.5b-full/checkpoint-550",
+  "epoch": 5.634920634920634,
+  "eval_steps": 50,
+  "global_step": 800,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.07054673721340388,
+      "grad_norm": 4.5,
+      "learning_rate": 1.267605633802817e-05,
+      "loss": 1.1558,
+      "step": 10
+    },
+    {
+      "epoch": 0.14109347442680775,
+      "grad_norm": 1.1953125,
+      "learning_rate": 2.676056338028169e-05,
+      "loss": 0.4274,
+      "step": 20
+    },
+    {
+      "epoch": 0.21164021164021163,
+      "grad_norm": 0.302734375,
+      "learning_rate": 4.0845070422535214e-05,
+      "loss": 0.0549,
+      "step": 30
+    },
+    {
+      "epoch": 0.2821869488536155,
+      "grad_norm": 1.21875,
+      "learning_rate": 5.492957746478874e-05,
+      "loss": 0.0471,
+      "step": 40
+    },
+    {
+      "epoch": 0.3527336860670194,
+      "grad_norm": 0.2119140625,
+      "learning_rate": 6.901408450704226e-05,
+      "loss": 0.0402,
+      "step": 50
+    },
+    {
+      "epoch": 0.3527336860670194,
+      "eval_loss": 0.03969533368945122,
+      "eval_runtime": 6.3012,
+      "eval_samples_per_second": 40.151,
+      "eval_steps_per_second": 10.157,
+      "step": 50
+    },
+    {
+      "epoch": 0.42328042328042326,
+      "grad_norm": 0.193359375,
+      "learning_rate": 8.309859154929578e-05,
+      "loss": 0.0377,
+      "step": 60
+    },
+    {
+      "epoch": 0.49382716049382713,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 9.718309859154931e-05,
+      "loss": 0.0343,
+      "step": 70
+    },
+    {
+      "epoch": 0.564373897707231,
+      "grad_norm": 0.19921875,
+      "learning_rate": 0.00011126760563380282,
+      "loss": 0.0356,
+      "step": 80
+    },
+    {
+      "epoch": 0.6349206349206349,
+      "grad_norm": 3.1875,
+      "learning_rate": 0.00012535211267605635,
+      "loss": 0.0855,
+      "step": 90
+    },
+    {
+      "epoch": 0.7054673721340388,
+      "grad_norm": 0.66796875,
+      "learning_rate": 0.00013943661971830987,
+      "loss": 0.0868,
+      "step": 100
+    },
+    {
+      "epoch": 0.7054673721340388,
+      "eval_loss": 0.05627487599849701,
+      "eval_runtime": 6.311,
+      "eval_samples_per_second": 40.089,
+      "eval_steps_per_second": 10.141,
+      "step": 100
+    },
+    {
+      "epoch": 0.7760141093474426,
+      "grad_norm": 0.8046875,
+      "learning_rate": 0.00015352112676056339,
+      "loss": 0.0538,
+      "step": 110
+    },
+    {
+      "epoch": 0.8465608465608465,
+      "grad_norm": 0.5234375,
+      "learning_rate": 0.0001676056338028169,
+      "loss": 0.0561,
+      "step": 120
+    },
+    {
+      "epoch": 0.9171075837742504,
+      "grad_norm": 1.8828125,
+      "learning_rate": 0.00018169014084507045,
+      "loss": 0.0641,
+      "step": 130
+    },
+    {
+      "epoch": 0.9876543209876543,
+      "grad_norm": 37.0,
+      "learning_rate": 0.00019577464788732396,
+      "loss": 0.0863,
+      "step": 140
+    },
+    {
+      "epoch": 1.056437389770723,
+      "grad_norm": 1.0546875,
+      "learning_rate": 0.00019999667815369528,
+      "loss": 0.3147,
+      "step": 150
+    },
+    {
+      "epoch": 1.056437389770723,
+      "eval_loss": 0.10409189015626907,
+      "eval_runtime": 6.0488,
+      "eval_samples_per_second": 41.826,
+      "eval_steps_per_second": 10.581,
+      "step": 150
+    },
+    {
+      "epoch": 1.126984126984127,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.00019998040841735952,
+      "loss": 0.0868,
+      "step": 160
+    },
+    {
+      "epoch": 1.1975308641975309,
+      "grad_norm": 0.2109375,
+      "learning_rate": 0.00019995058285912832,
+      "loss": 0.0527,
+      "step": 170
+    },
+    {
+      "epoch": 1.2680776014109347,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 0.00019990720552289347,
+      "loss": 0.0418,
+      "step": 180
+    },
+    {
+      "epoch": 1.3386243386243386,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.00019985028228996173,
+      "loss": 0.0328,
+      "step": 190
+    },
+    {
+      "epoch": 1.4091710758377425,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.00019977982087825713,
+      "loss": 0.0389,
+      "step": 200
+    },
+    {
+      "epoch": 1.4091710758377425,
+      "eval_loss": 0.03491974622011185,
+      "eval_runtime": 6.042,
+      "eval_samples_per_second": 41.874,
+      "eval_steps_per_second": 10.593,
+      "step": 200
+    },
+    {
+      "epoch": 1.4797178130511464,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.00019969583084127485,
+      "loss": 0.0338,
+      "step": 210
+    },
+    {
+      "epoch": 1.5502645502645502,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.00019959832356678583,
+      "loss": 0.033,
+      "step": 220
+    },
+    {
+      "epoch": 1.620811287477954,
+      "grad_norm": 0.12255859375,
+      "learning_rate": 0.00019948731227529258,
+      "loss": 0.0313,
+      "step": 230
+    },
+    {
+      "epoch": 1.691358024691358,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.00019936281201823688,
+      "loss": 0.0288,
+      "step": 240
+    },
+    {
+      "epoch": 1.7619047619047619,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.00019922483967595893,
+      "loss": 0.0322,
+      "step": 250
+    },
+    {
+      "epoch": 1.7619047619047619,
+      "eval_loss": 0.03154641017317772,
+      "eval_runtime": 6.0077,
+      "eval_samples_per_second": 42.113,
+      "eval_steps_per_second": 10.653,
+      "step": 250
+    },
+    {
+      "epoch": 1.8324514991181657,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.00019907341395540877,
+      "loss": 0.0279,
+      "step": 260
+    },
+    {
+      "epoch": 1.9029982363315696,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.00019890855538760974,
+      "loss": 0.0293,
+      "step": 270
+    },
+    {
+      "epoch": 1.9735449735449735,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.00019873028632487474,
+      "loss": 0.029,
+      "step": 280
+    },
+    {
+      "epoch": 2.0423280423280423,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001985386309377759,
+      "loss": 0.0265,
+      "step": 290
+    },
+    {
+      "epoch": 2.112874779541446,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 0.0001983336152118671,
+      "loss": 0.0256,
+      "step": 300
+    },
+    {
+      "epoch": 2.112874779541446,
+      "eval_loss": 0.029803331941366196,
+      "eval_runtime": 5.9968,
+      "eval_samples_per_second": 42.189,
+      "eval_steps_per_second": 10.672,
+      "step": 300
+    },
+    {
+      "epoch": 2.18342151675485,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001981152669441609,
+      "loss": 0.0247,
+      "step": 310
+    },
+    {
+      "epoch": 2.253968253968254,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.00019788361573935958,
+      "loss": 0.0257,
+      "step": 320
+    },
+    {
+      "epoch": 2.324514991181658,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.00019763869300584128,
+      "loss": 0.0254,
+      "step": 330
+    },
+    {
+      "epoch": 2.3950617283950617,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.00019738053195140148,
+      "loss": 0.023,
+      "step": 340
+    },
+    {
+      "epoch": 2.4656084656084656,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.00019710916757875052,
+      "loss": 0.0243,
+      "step": 350
+    },
+    {
+      "epoch": 2.4656084656084656,
+      "eval_loss": 0.029007520526647568,
+      "eval_runtime": 6.0663,
+      "eval_samples_per_second": 41.706,
+      "eval_steps_per_second": 10.55,
+      "step": 350
+    },
+    {
+      "epoch": 2.5361552028218695,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001968246366807677,
+      "loss": 0.0232,
+      "step": 360
+    },
+    {
+      "epoch": 2.6067019400352733,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001965269778355129,
+      "loss": 0.0253,
+      "step": 370
+    },
+    {
+      "epoch": 2.677248677248677,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.00019621623140099578,
+      "loss": 0.0253,
+      "step": 380
+    },
+    {
+      "epoch": 2.747795414462081,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.00019589243950970402,
+      "loss": 0.0231,
+      "step": 390
+    },
+    {
+      "epoch": 2.818342151675485,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001955556460628906,
+      "loss": 0.0237,
+      "step": 400
+    },
+    {
+      "epoch": 2.818342151675485,
+      "eval_loss": 0.027823707088828087,
+      "eval_runtime": 6.0126,
+      "eval_samples_per_second": 42.078,
+      "eval_steps_per_second": 10.644,
+      "step": 400
+    },
+    {
+      "epoch": 2.888888888888889,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001952058967246217,
+      "loss": 0.025,
+      "step": 410
+    },
+    {
+      "epoch": 2.9594356261022927,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 0.00019484323891558506,
+      "loss": 0.0238,
+      "step": 420
+    },
+    {
+      "epoch": 3.0282186948853616,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.00019446772180666084,
+      "loss": 0.0206,
+      "step": 430
+    },
+    {
+      "epoch": 3.0987654320987654,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.00019407939631225439,
+      "loss": 0.019,
+      "step": 440
+    },
+    {
+      "epoch": 3.1693121693121693,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.00019367831508339327,
+      "loss": 0.0186,
+      "step": 450
+    },
+    {
+      "epoch": 3.1693121693121693,
+      "eval_loss": 0.029222065582871437,
+      "eval_runtime": 5.9948,
+      "eval_samples_per_second": 42.203,
+      "eval_steps_per_second": 10.676,
+      "step": 450
+    },
+    {
+      "epoch": 3.239858906525573,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001932645325005885,
+      "loss": 0.0167,
+      "step": 460
+    },
+    {
+      "epoch": 3.310405643738977,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001928381046664615,
+      "loss": 0.0208,
+      "step": 470
+    },
+    {
+      "epoch": 3.380952380952381,
+      "grad_norm": 0.119140625,
+      "learning_rate": 0.00019239908939813722,
+      "loss": 0.0197,
+      "step": 480
+    },
+    {
+      "epoch": 3.451499118165785,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001919475462194052,
+      "loss": 0.0199,
+      "step": 490
+    },
+    {
+      "epoch": 3.5220458553791887,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.00019148353635264895,
+      "loss": 0.0209,
+      "step": 500
+    },
+    {
+      "epoch": 3.5220458553791887,
+      "eval_loss": 0.028565241023898125,
+      "eval_runtime": 6.5453,
+      "eval_samples_per_second": 38.654,
+      "eval_steps_per_second": 9.778,
+      "step": 500
+    },
+    {
+      "epoch": 3.5925925925925926,
+      "grad_norm": 0.1103515625,
+      "learning_rate": 0.00019100712271054516,
+      "loss": 0.0215,
+      "step": 510
+    },
+    {
+      "epoch": 3.6631393298059964,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.00019051836988753372,
+      "loss": 0.022,
+      "step": 520
+    },
+    {
+      "epoch": 3.7336860670194003,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001900173441510597,
+      "loss": 0.02,
+      "step": 530
+    },
+    {
+      "epoch": 3.804232804232804,
+      "grad_norm": 0.0732421875,
+      "learning_rate": 0.00018950411343258842,
+      "loss": 0.0201,
+      "step": 540
+    },
+    {
+      "epoch": 3.874779541446208,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.00018897874731839504,
+      "loss": 0.0208,
+      "step": 550
+    },
+    {
+      "epoch": 3.874779541446208,
+      "eval_loss": 0.027798546478152275,
+      "eval_runtime": 6.1662,
+      "eval_samples_per_second": 41.03,
+      "eval_steps_per_second": 10.379,
+      "step": 550
+    },
+    {
+      "epoch": 3.945326278659612,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.00018844131704012968,
+      "loss": 0.0202,
+      "step": 560
+    },
+    {
+      "epoch": 4.01410934744268,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.00018789189546515958,
+      "loss": 0.0199,
+      "step": 570
+    },
+    {
+      "epoch": 4.084656084656085,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.00018733055708668926,
+      "loss": 0.0143,
+      "step": 580
+    },
+    {
+      "epoch": 4.155202821869489,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.00018675737801366056,
+      "loss": 0.0141,
+      "step": 590
+    },
+    {
+      "epoch": 4.225749559082892,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.00018617243596043314,
+      "loss": 0.0158,
+      "step": 600
+    },
+    {
+      "epoch": 4.225749559082892,
+      "eval_loss": 0.02914673648774624,
+      "eval_runtime": 6.1782,
+      "eval_samples_per_second": 40.95,
+      "eval_steps_per_second": 10.359,
+      "step": 600
+    },
+    {
+      "epoch": 4.296296296296296,
+      "grad_norm": 0.11279296875,
+      "learning_rate": 0.00018557581023624788,
+      "loss": 0.0158,
+      "step": 610
+    },
+    {
+      "epoch": 4.3668430335097,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.00018496758173447368,
+      "loss": 0.0166,
+      "step": 620
+    },
+    {
+      "epoch": 4.4373897707231045,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001843478329216394,
+      "loss": 0.0171,
+      "step": 630
+    },
+    {
+      "epoch": 4.507936507936508,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.00018371664782625287,
+      "loss": 0.0164,
+      "step": 640
+    },
+    {
+      "epoch": 4.578483245149911,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.00018307411202740773,
+      "loss": 0.0175,
+      "step": 650
+    },
+    {
+      "epoch": 4.578483245149911,
+      "eval_loss": 0.02929055318236351,
+      "eval_runtime": 6.0557,
+      "eval_samples_per_second": 41.779,
+      "eval_steps_per_second": 10.568,
+      "step": 650
+    },
+    {
+      "epoch": 4.649029982363316,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.00018242031264318026,
+      "loss": 0.0179,
+      "step": 660
+    },
+    {
+      "epoch": 4.71957671957672,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.00018175533831881757,
+      "loss": 0.0175,
+      "step": 670
+    },
+    {
+      "epoch": 4.790123456790123,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001810792792147186,
+      "loss": 0.0177,
+      "step": 680
+    },
+    {
+      "epoch": 4.860670194003527,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.00018039222699420965,
+      "loss": 0.0177,
+      "step": 690
+    },
+    {
+      "epoch": 4.931216931216931,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.00017969427481111643,
+      "loss": 0.0184,
+      "step": 700
+    },
+    {
+      "epoch": 4.931216931216931,
+      "eval_loss": 0.028933366760611534,
+      "eval_runtime": 6.028,
+      "eval_samples_per_second": 41.971,
+      "eval_steps_per_second": 10.617,
+      "step": 700
+    },
+    {
+      "epoch": 5.0,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.00017898551729713362,
+      "loss": 0.0166,
+      "step": 710
+    },
+    {
+      "epoch": 5.070546737213403,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.00017826605054899433,
+      "loss": 0.0125,
+      "step": 720
+    },
+    {
+      "epoch": 5.141093474426808,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.00017753597211544092,
+      "loss": 0.0139,
+      "step": 730
+    },
+    {
+      "epoch": 5.211640211640212,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001767953809839987,
+      "loss": 0.0136,
+      "step": 740
+    },
+    {
+      "epoch": 5.2821869488536155,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.00017604437756755498,
+      "loss": 0.014,
+      "step": 750
+    },
+    {
+      "epoch": 5.2821869488536155,
+      "eval_loss": 0.030887732282280922,
+      "eval_runtime": 6.0316,
+      "eval_samples_per_second": 41.946,
+      "eval_steps_per_second": 10.611,
+      "step": 750
+    },
+    {
+      "epoch": 5.352733686067019,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001752830636907443,
+      "loss": 0.0139,
+      "step": 760
+    },
+    {
+      "epoch": 5.423280423280423,
+      "grad_norm": 0.1201171875,
+      "learning_rate": 0.00017451154257614287,
+      "loss": 0.0142,
+      "step": 770
+    },
+    {
+      "epoch": 5.493827160493828,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.00017372991883027287,
+      "loss": 0.0142,
+      "step": 780
+    },
+    {
+      "epoch": 5.564373897707231,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.00017293829842941972,
+      "loss": 0.0146,
+      "step": 790
+    },
+    {
+      "epoch": 5.634920634920634,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.00017213678870526292,
+      "loss": 0.0147,
+      "step": 800
+    },
+    {
+      "epoch": 5.634920634920634,
+      "eval_loss": 0.02992367185652256,
+      "eval_runtime": 6.0048,
+      "eval_samples_per_second": 42.133,
+      "eval_steps_per_second": 10.658,
+      "step": 800
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 2840,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 20,
+  "save_steps": 50,
+  "stateful_callbacks": {
+    "EarlyStoppingCallback": {
+      "args": {
+        "early_stopping_patience": 5,
+        "early_stopping_threshold": 0.0
+      },
+      "attributes": {
+        "early_stopping_patience_counter": 5
+      }
+    },
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.8482800577851085e+17,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}