Training in progress, epoch 10, checkpoint

Browse files

Files changed (9) hide show

last-checkpoint/config.json +2 -1
last-checkpoint/model-00001-of-00003.safetensors +1 -1
last-checkpoint/model-00002-of-00003.safetensors +1 -1
last-checkpoint/model-00003-of-00003.safetensors +1 -1
last-checkpoint/optimizer.pt +1 -1
last-checkpoint/rng_state.pth +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +482 -523
last-checkpoint/training_args.bin +1 -1

last-checkpoint/config.json CHANGED Viewed

@@ -4,7 +4,7 @@
     "Phi4MMForCausalLM"
   ],
   "attention_bias": false,
-  "attention_dropout": 0.0,
   "audio_processor": {
     "config": {
       "activation": "swish",
@@ -53,6 +53,7 @@
     "AutoTokenizer": "microsoft/Phi-4-multimodal-instruct--Xenova/gpt-4o"
   },
   "bos_token_id": 199999,
   "embd_layer": {
     "audio_embd_layer": {
       "compression_rate": 8,

     "Phi4MMForCausalLM"
   ],
   "attention_bias": false,
+  "attention_dropout": 0.1,
   "audio_processor": {
     "config": {
       "activation": "swish",
     "AutoTokenizer": "microsoft/Phi-4-multimodal-instruct--Xenova/gpt-4o"
   },
   "bos_token_id": 199999,
+  "dropout": 0.1,
   "embd_layer": {
     "audio_embd_layer": {
       "compression_rate": 8,

last-checkpoint/model-00001-of-00003.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2dba202c4bd78baf747939330a784ddb3edf466e2590e23ae264ea8c0bf8af4e
 size 4998420448

 version https://git-lfs.github.com/spec/v1
+oid sha256:b938ce88ce1cffbca9406119f3bf3d6a8c7c221672c70d7f6a014ed31288208e
 size 4998420448

last-checkpoint/model-00002-of-00003.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:dbb84b667bd07926038796d7587907cbfcbc94dad52669f9e5e32c125b107d10
 size 4983891952

 version https://git-lfs.github.com/spec/v1
+oid sha256:f5b3aa7d526316649da9fe097edf86c134909ce89af2c5e4be748b52708a2e5b
 size 4983891952

last-checkpoint/model-00003-of-00003.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8b0f9f026114b452f2afea019b04b6f49b6c86c4f8aa8ad865e681b8a1634355
 size 1905111704

 version https://git-lfs.github.com/spec/v1
+oid sha256:115e7c916a84bb2ca309529ea182bfd9a9fc6603de3b3220122143164643d022
 size 1905111704

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:87c6c051e11b415870596076dfafbffe58ceab60bb8ee77b20d335152338d956
 size 15344257558

 version https://git-lfs.github.com/spec/v1
+oid sha256:4503be4bed2bdc196f9aa5b48728866e8bb65dd630a8dd2e33efcd021e17858d
 size 15344257558

last-checkpoint/rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c70a41264c08a1e8401b7173fe9901cfca41eb5cf987bd975ed722bcda9db818
 size 14244

 version https://git-lfs.github.com/spec/v1
+oid sha256:ac0389b5da961b38667013030da96e0e998cdc2366307000dfb275a026d99b15
 size 14244

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:017002c2d629e5f1c7bb7237618315ba17edcbbd589c1b4ae48239e8f9a1d79f
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:782bc8820f2e383abba755c44f38af6b90e6ed80083b23c764e330fe7a1a700b
 size 1064

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -1,837 +1,796 @@
 {
-  "best_metric": null,
-  "best_model_checkpoint": null,
-  "epoch": 9.0,
   "eval_steps": 500,
-  "global_step": 2682,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.08389261744966443,
-      "grad_norm": 52.25,
-      "learning_rate": 5.555555555555557e-06,
-      "loss": 4.2261,
       "step": 25
     },
     {
-      "epoch": 0.16778523489932887,
-      "grad_norm": 30.625,
-      "learning_rate": 1.1111111111111113e-05,
-      "loss": 2.4269,
       "step": 50
     },
     {
-      "epoch": 0.2516778523489933,
-      "grad_norm": 25.5,
-      "learning_rate": 1.6666666666666667e-05,
-      "loss": 1.6724,
       "step": 75
     },
     {
-      "epoch": 0.33557046979865773,
-      "grad_norm": 18.0,
-      "learning_rate": 1.9999409160138695e-05,
-      "loss": 1.6634,
       "step": 100
     },
     {
-      "epoch": 0.41946308724832215,
-      "grad_norm": 19.375,
-      "learning_rate": 1.9992763013493023e-05,
-      "loss": 1.4662,
       "step": 125
     },
     {
-      "epoch": 0.5033557046979866,
-      "grad_norm": 16.25,
-      "learning_rate": 1.9978737094995525e-05,
-      "loss": 1.5513,
       "step": 150
     },
     {
-      "epoch": 0.587248322147651,
-      "grad_norm": 18.125,
-      "learning_rate": 1.9957341762950346e-05,
-      "loss": 1.4649,
       "step": 175
     },
     {
-      "epoch": 0.6711409395973155,
-      "grad_norm": 17.0,
-      "learning_rate": 1.992859281805935e-05,
-      "loss": 1.4342,
       "step": 200
     },
     {
-      "epoch": 0.7550335570469798,
-      "grad_norm": 15.375,
-      "learning_rate": 1.9892511491753126e-05,
-      "loss": 1.4756,
       "step": 225
     },
     {
-      "epoch": 0.8389261744966443,
-      "grad_norm": 19.0,
-      "learning_rate": 1.984912443051131e-05,
-      "loss": 1.4059,
       "step": 250
     },
     {
-      "epoch": 0.9228187919463087,
-      "grad_norm": 13.75,
-      "learning_rate": 1.9798463676183887e-05,
-      "loss": 1.437,
-      "step": 275
     },
     {
-      "epoch": 1.0,
-      "eval_loss": 1.4382814168930054,
-      "eval_runtime": 1.8726,
-      "eval_samples_per_second": 67.287,
-      "eval_steps_per_second": 8.544,
-      "step": 298
     },
     {
-      "epoch": 1.0067114093959733,
-      "grad_norm": 11.0625,
-      "learning_rate": 1.9740566642327868e-05,
-      "loss": 1.3453,
       "step": 300
     },
     {
-      "epoch": 1.0906040268456376,
-      "grad_norm": 16.875,
-      "learning_rate": 1.967547608657697e-05,
-      "loss": 1.0968,
       "step": 325
     },
     {
-      "epoch": 1.174496644295302,
-      "grad_norm": 17.0,
-      "learning_rate": 1.9603240079064605e-05,
-      "loss": 1.0313,
       "step": 350
     },
     {
-      "epoch": 1.2583892617449663,
-      "grad_norm": 19.0,
-      "learning_rate": 1.9523911966923506e-05,
-      "loss": 1.1095,
       "step": 375
     },
     {
-      "epoch": 1.342281879194631,
-      "grad_norm": 17.625,
-      "learning_rate": 1.9437550334888277e-05,
-      "loss": 1.1064,
       "step": 400
     },
     {
-      "epoch": 1.4261744966442953,
-      "grad_norm": 17.125,
-      "learning_rate": 1.9344218962029856e-05,
-      "loss": 1.0744,
       "step": 425
     },
     {
-      "epoch": 1.5100671140939599,
-      "grad_norm": 15.0625,
-      "learning_rate": 1.9243986774653954e-05,
-      "loss": 1.0505,
       "step": 450
     },
     {
-      "epoch": 1.5939597315436242,
-      "grad_norm": 16.375,
-      "learning_rate": 1.9136927795398158e-05,
-      "loss": 1.0781,
       "step": 475
     },
     {
-      "epoch": 1.6778523489932886,
-      "grad_norm": 10.625,
-      "learning_rate": 1.9023121088565353e-05,
-      "loss": 1.0273,
       "step": 500
     },
     {
-      "epoch": 1.761744966442953,
-      "grad_norm": 14.0625,
-      "learning_rate": 1.890265070173382e-05,
-      "loss": 1.1687,
       "step": 525
     },
     {
-      "epoch": 1.8456375838926173,
-      "grad_norm": 19.375,
-      "learning_rate": 1.8775605603687128e-05,
-      "loss": 1.0875,
       "step": 550
     },
     {
-      "epoch": 1.929530201342282,
-      "grad_norm": 18.75,
-      "learning_rate": 1.8642079618709627e-05,
-      "loss": 1.0308,
       "step": 575
     },
     {
-      "epoch": 2.0,
-      "eval_loss": 1.3985741138458252,
-      "eval_runtime": 1.8989,
-      "eval_samples_per_second": 66.353,
-      "eval_steps_per_second": 8.426,
-      "step": 596
-    },
-    {
-      "epoch": 2.0134228187919465,
-      "grad_norm": 14.125,
-      "learning_rate": 1.8502171357296144e-05,
-      "loss": 1.0441,
       "step": 600
     },
     {
-      "epoch": 2.097315436241611,
-      "grad_norm": 15.5625,
-      "learning_rate": 1.8355984143326968e-05,
-      "loss": 0.7089,
       "step": 625
     },
     {
-      "epoch": 2.1812080536912752,
-      "grad_norm": 17.5,
-      "learning_rate": 1.820362593776198e-05,
-      "loss": 0.6969,
       "step": 650
     },
     {
-      "epoch": 2.2651006711409396,
-      "grad_norm": 17.125,
-      "learning_rate": 1.804520925891021e-05,
-      "loss": 0.7308,
       "step": 675
     },
     {
-      "epoch": 2.348993288590604,
-      "grad_norm": 19.5,
-      "learning_rate": 1.7880851099333762e-05,
-      "loss": 0.7553,
       "step": 700
     },
     {
-      "epoch": 2.4328859060402683,
-      "grad_norm": 18.375,
-      "learning_rate": 1.7710672839447442e-05,
-      "loss": 0.6881,
       "step": 725
     },
     {
-      "epoch": 2.5167785234899327,
-      "grad_norm": 16.375,
-      "learning_rate": 1.753480015787792e-05,
-      "loss": 0.744,
       "step": 750
     },
     {
-      "epoch": 2.600671140939597,
-      "grad_norm": 23.375,
-      "learning_rate": 1.735336293864857e-05,
-      "loss": 0.751,
       "step": 775
     },
     {
-      "epoch": 2.684563758389262,
-      "grad_norm": 15.5,
-      "learning_rate": 1.7166495175258654e-05,
-      "loss": 0.6958,
       "step": 800
     },
     {
-      "epoch": 2.7684563758389262,
-      "grad_norm": 25.25,
-      "learning_rate": 1.697433487172752e-05,
-      "loss": 0.7711,
       "step": 825
     },
     {
-      "epoch": 2.8523489932885906,
-      "grad_norm": 13.0625,
-      "learning_rate": 1.6777023940677036e-05,
-      "loss": 0.7639,
       "step": 850
     },
     {
-      "epoch": 2.936241610738255,
-      "grad_norm": 16.25,
-      "learning_rate": 1.657470809852749e-05,
-      "loss": 0.8271,
       "step": 875
     },
     {
-      "epoch": 3.0,
-      "eval_loss": 1.5429246425628662,
-      "eval_runtime": 1.949,
-      "eval_samples_per_second": 64.648,
-      "eval_steps_per_second": 8.209,
-      "step": 894
-    },
-    {
-      "epoch": 3.0201342281879193,
-      "grad_norm": 15.5,
-      "learning_rate": 1.6367536757884285e-05,
-      "loss": 0.6906,
       "step": 900
     },
     {
-      "epoch": 3.1040268456375837,
-      "grad_norm": 21.75,
-      "learning_rate": 1.615566291719502e-05,
-      "loss": 0.4225,
       "step": 925
     },
     {
-      "epoch": 3.1879194630872485,
-      "grad_norm": 18.0,
-      "learning_rate": 1.5939243047758312e-05,
-      "loss": 0.4291,
       "step": 950
     },
     {
-      "epoch": 3.271812080536913,
-      "grad_norm": 26.75,
-      "learning_rate": 1.5718436978167976e-05,
-      "loss": 0.4474,
       "step": 975
     },
     {
-      "epoch": 3.3557046979865772,
-      "grad_norm": 20.25,
-      "learning_rate": 1.54934077762777e-05,
-      "loss": 0.4227,
       "step": 1000
     },
     {
-      "epoch": 3.4395973154362416,
-      "grad_norm": 18.5,
-      "learning_rate": 1.526432162877356e-05,
-      "loss": 0.4062,
       "step": 1025
     },
     {
-      "epoch": 3.523489932885906,
-      "grad_norm": 13.3125,
-      "learning_rate": 1.5031347718443212e-05,
-      "loss": 0.4282,
       "step": 1050
     },
     {
-      "epoch": 3.6073825503355703,
-      "grad_norm": 12.875,
-      "learning_rate": 1.4794658099232426e-05,
-      "loss": 0.451,
       "step": 1075
     },
     {
-      "epoch": 3.6912751677852347,
-      "grad_norm": 18.0,
-      "learning_rate": 1.455442756918126e-05,
-      "loss": 0.407,
       "step": 1100
     },
     {
-      "epoch": 3.7751677852348995,
-      "grad_norm": 22.375,
-      "learning_rate": 1.4310833541333658e-05,
-      "loss": 0.4382,
       "step": 1125
     },
     {
-      "epoch": 3.859060402684564,
-      "grad_norm": 23.0,
-      "learning_rate": 1.4064055912715846e-05,
-      "loss": 0.4321,
       "step": 1150
     },
     {
-      "epoch": 3.942953020134228,
-      "grad_norm": 18.375,
-      "learning_rate": 1.3814276931480308e-05,
-      "loss": 0.4301,
       "step": 1175
     },
     {
-      "epoch": 4.0,
-      "eval_loss": 1.8369454145431519,
-      "eval_runtime": 1.9833,
-      "eval_samples_per_second": 63.53,
-      "eval_steps_per_second": 8.067,
-      "step": 1192
-    },
-    {
-      "epoch": 4.026845637583893,
-      "grad_norm": 25.25,
-      "learning_rate": 1.356168106231337e-05,
-      "loss": 0.3562,
       "step": 1200
     },
     {
-      "epoch": 4.110738255033557,
-      "grad_norm": 18.625,
-      "learning_rate": 1.3306454850205914e-05,
-      "loss": 0.2158,
       "step": 1225
     },
     {
-      "epoch": 4.194630872483222,
-      "grad_norm": 23.875,
-      "learning_rate": 1.3048786782687706e-05,
-      "loss": 0.1972,
       "step": 1250
     },
     {
-      "epoch": 4.278523489932886,
-      "grad_norm": 17.0,
-      "learning_rate": 1.2788867150627163e-05,
-      "loss": 0.1911,
       "step": 1275
     },
     {
-      "epoch": 4.3624161073825505,
-      "grad_norm": 13.6875,
-      "learning_rate": 1.2526887907699349e-05,
-      "loss": 0.2341,
       "step": 1300
     },
     {
-      "epoch": 4.446308724832215,
-      "grad_norm": 21.5,
-      "learning_rate": 1.2263042528625928e-05,
-      "loss": 0.1984,
       "step": 1325
     },
     {
-      "epoch": 4.530201342281879,
-      "grad_norm": 24.25,
-      "learning_rate": 1.1997525866291842e-05,
-      "loss": 0.1688,
       "step": 1350
     },
     {
-      "epoch": 4.614093959731544,
-      "grad_norm": 23.375,
-      "learning_rate": 1.1730534007844186e-05,
-      "loss": 0.1869,
       "step": 1375
     },
     {
-      "epoch": 4.697986577181208,
-      "grad_norm": 16.75,
-      "learning_rate": 1.1462264129879555e-05,
-      "loss": 0.2089,
       "step": 1400
     },
     {
-      "epoch": 4.781879194630872,
-      "grad_norm": 20.25,
-      "learning_rate": 1.1192914352826849e-05,
-      "loss": 0.1923,
       "step": 1425
     },
     {
-      "epoch": 4.865771812080537,
-      "grad_norm": 46.25,
-      "learning_rate": 1.092268359463302e-05,
-      "loss": 0.1786,
       "step": 1450
     },
     {
-      "epoch": 4.949664429530201,
-      "grad_norm": 19.25,
-      "learning_rate": 1.0651771423859845e-05,
-      "loss": 0.2009,
       "step": 1475
     },
     {
-      "epoch": 5.0,
-      "eval_loss": 2.166541337966919,
-      "eval_runtime": 1.83,
-      "eval_samples_per_second": 68.853,
-      "eval_steps_per_second": 8.743,
-      "step": 1490
     },
     {
-      "epoch": 5.033557046979865,
-      "grad_norm": 13.4375,
-      "learning_rate": 1.0380377912300231e-05,
-      "loss": 0.158,
-      "step": 1500
     },
     {
-      "epoch": 5.117449664429531,
-      "grad_norm": 6.5,
-      "learning_rate": 1.0108703487222855e-05,
-      "loss": 0.0834,
       "step": 1525
     },
     {
-      "epoch": 5.201342281879195,
-      "grad_norm": 14.9375,
-      "learning_rate": 9.836948783354308e-06,
-      "loss": 0.0662,
       "step": 1550
     },
     {
-      "epoch": 5.285234899328859,
-      "grad_norm": 11.125,
-      "learning_rate": 9.565314494707995e-06,
-      "loss": 0.0883,
       "step": 1575
     },
     {
-      "epoch": 5.369127516778524,
-      "grad_norm": 15.375,
-      "learning_rate": 9.294001226369281e-06,
-      "loss": 0.0734,
       "step": 1600
     },
     {
-      "epoch": 5.453020134228188,
-      "grad_norm": 8.75,
-      "learning_rate": 9.023209346346293e-06,
-      "loss": 0.079,
       "step": 1625
     },
     {
-      "epoch": 5.5369127516778525,
-      "grad_norm": 35.25,
-      "learning_rate": 8.753138837595818e-06,
-      "loss": 0.0793,
       "step": 1650
     },
     {
-      "epoch": 5.620805369127517,
-      "grad_norm": 14.0,
-      "learning_rate": 8.483989150333556e-06,
-      "loss": 0.0793,
       "step": 1675
     },
     {
-      "epoch": 5.704697986577181,
-      "grad_norm": 32.5,
-      "learning_rate": 8.215959054737817e-06,
-      "loss": 0.0683,
       "step": 1700
     },
     {
-      "epoch": 5.7885906040268456,
-      "grad_norm": 8.625,
-      "learning_rate": 7.94924649415542e-06,
-      "loss": 0.0788,
       "step": 1725
     },
     {
-      "epoch": 5.87248322147651,
-      "grad_norm": 7.8125,
-      "learning_rate": 7.684048438918247e-06,
-      "loss": 0.0699,
       "step": 1750
     },
     {
-      "epoch": 5.956375838926174,
-      "grad_norm": 7.625,
-      "learning_rate": 7.420560740878335e-06,
-      "loss": 0.0988,
-      "step": 1775
     },
     {
-      "epoch": 6.0,
-      "eval_loss": 2.644843578338623,
-      "eval_runtime": 1.8563,
-      "eval_samples_per_second": 67.878,
-      "eval_steps_per_second": 8.619,
-      "step": 1788
     },
     {
-      "epoch": 6.040268456375839,
-      "grad_norm": 5.03125,
-      "learning_rate": 7.1589779887690235e-06,
-      "loss": 0.0675,
       "step": 1800
     },
     {
-      "epoch": 6.124161073825503,
-      "grad_norm": 5.9375,
-      "learning_rate": 6.899493364498884e-06,
-      "loss": 0.0397,
       "step": 1825
     },
     {
-      "epoch": 6.208053691275167,
-      "grad_norm": 11.5625,
-      "learning_rate": 6.642298500484657e-06,
-      "loss": 0.0314,
       "step": 1850
     },
     {
-      "epoch": 6.291946308724833,
-      "grad_norm": 5.21875,
-      "learning_rate": 6.387583338128471e-06,
-      "loss": 0.0401,
       "step": 1875
     },
     {
-      "epoch": 6.375838926174497,
-      "grad_norm": 7.5,
-      "learning_rate": 6.1355359875438995e-06,
-      "loss": 0.0286,
       "step": 1900
     },
     {
-      "epoch": 6.459731543624161,
-      "grad_norm": 8.0,
-      "learning_rate": 5.886342588634458e-06,
-      "loss": 0.0374,
       "step": 1925
     },
     {
-      "epoch": 6.543624161073826,
-      "grad_norm": 24.5,
-      "learning_rate": 5.64018717362711e-06,
-      "loss": 0.0404,
       "step": 1950
     },
     {
-      "epoch": 6.62751677852349,
-      "grad_norm": 16.75,
-      "learning_rate": 5.397251531162332e-06,
-      "loss": 0.0256,
       "step": 1975
     },
     {
-      "epoch": 6.7114093959731544,
-      "grad_norm": 6.0,
-      "learning_rate": 5.157715072041094e-06,
-      "loss": 0.037,
       "step": 2000
     },
     {
-      "epoch": 6.795302013422819,
-      "grad_norm": 5.9375,
-      "learning_rate": 4.92175469672787e-06,
-      "loss": 0.0365,
       "step": 2025
     },
     {
-      "epoch": 6.879194630872483,
-      "grad_norm": 6.28125,
-      "learning_rate": 4.6895446647076e-06,
-      "loss": 0.0288,
       "step": 2050
     },
     {
-      "epoch": 6.9630872483221475,
-      "grad_norm": 19.375,
-      "learning_rate": 4.461256465793033e-06,
-      "loss": 0.0324,
       "step": 2075
     },
     {
-      "epoch": 7.0,
-      "eval_loss": 2.95961856842041,
-      "eval_runtime": 1.9194,
-      "eval_samples_per_second": 65.646,
-      "eval_steps_per_second": 8.336,
-      "step": 2086
-    },
-    {
-      "epoch": 7.046979865771812,
-      "grad_norm": 2.34375,
-      "learning_rate": 4.237058693477499e-06,
-      "loss": 0.0205,
       "step": 2100
     },
     {
-      "epoch": 7.130872483221476,
-      "grad_norm": 2.140625,
-      "learning_rate": 4.017116920426652e-06,
-      "loss": 0.018,
       "step": 2125
     },
     {
-      "epoch": 7.214765100671141,
-      "grad_norm": 3.203125,
-      "learning_rate": 3.801593576201118e-06,
-      "loss": 0.022,
       "step": 2150
     },
     {
-      "epoch": 7.298657718120805,
-      "grad_norm": 7.28125,
-      "learning_rate": 3.5906478273004053e-06,
-      "loss": 0.0222,
       "step": 2175
     },
     {
-      "epoch": 7.382550335570469,
-      "grad_norm": 3.375,
-      "learning_rate": 3.3844354596165364e-06,
-      "loss": 0.0203,
       "step": 2200
     },
     {
-      "epoch": 7.466442953020135,
-      "grad_norm": 7.28125,
-      "learning_rate": 3.183108763384415e-06,
-      "loss": 0.0183,
       "step": 2225
     },
     {
-      "epoch": 7.550335570469799,
-      "grad_norm": 6.8125,
-      "learning_rate": 2.986816420713662e-06,
-      "loss": 0.0174,
       "step": 2250
     },
     {
-      "epoch": 7.634228187919463,
-      "grad_norm": 3.34375,
-      "learning_rate": 2.795703395785184e-06,
-      "loss": 0.0178,
       "step": 2275
     },
     {
-      "epoch": 7.718120805369128,
-      "grad_norm": 3.53125,
-      "learning_rate": 2.6099108277934105e-06,
-      "loss": 0.0186,
       "step": 2300
     },
     {
-      "epoch": 7.802013422818792,
-      "grad_norm": 5.28125,
-      "learning_rate": 2.42957592671337e-06,
-      "loss": 0.0204,
       "step": 2325
     },
     {
-      "epoch": 7.885906040268456,
-      "grad_norm": 2.625,
-      "learning_rate": 2.2548318719695182e-06,
-      "loss": 0.0195,
       "step": 2350
     },
     {
-      "epoch": 7.969798657718121,
-      "grad_norm": 3.640625,
-      "learning_rate": 2.085807714081195e-06,
-      "loss": 0.0182,
       "step": 2375
     },
     {
-      "epoch": 8.0,
-      "eval_loss": 3.16278076171875,
-      "eval_runtime": 1.9451,
-      "eval_samples_per_second": 64.779,
-      "eval_steps_per_second": 8.226,
-      "step": 2384
-    },
-    {
-      "epoch": 8.053691275167786,
-      "grad_norm": 1.9296875,
-      "learning_rate": 1.9226282793572927e-06,
-      "loss": 0.0132,
       "step": 2400
     },
     {
-      "epoch": 8.13758389261745,
-      "grad_norm": 1.28125,
-      "learning_rate": 1.7654140777105954e-06,
-      "loss": 0.0139,
       "step": 2425
     },
     {
-      "epoch": 8.221476510067115,
-      "grad_norm": 1.640625,
-      "learning_rate": 1.6142812136597852e-06,
-      "loss": 0.0125,
       "step": 2450
     },
     {
-      "epoch": 8.305369127516778,
-      "grad_norm": 1.9375,
-      "learning_rate": 1.4693413005849143e-06,
-      "loss": 0.0198,
       "step": 2475
     },
     {
-      "epoch": 8.389261744966444,
-      "grad_norm": 2.28125,
-      "learning_rate": 1.3307013782996237e-06,
-      "loss": 0.0158,
       "step": 2500
     },
     {
-      "epoch": 8.473154362416107,
-      "grad_norm": 4.53125,
-      "learning_rate": 1.1984638340009935e-06,
-      "loss": 0.0159,
-      "step": 2525
-    },
-    {
-      "epoch": 8.557046979865772,
-      "grad_norm": 3.203125,
-      "learning_rate": 1.0727263266554012e-06,
-      "loss": 0.0189,
-      "step": 2550
-    },
-    {
-      "epoch": 8.640939597315436,
-      "grad_norm": 4.28125,
-      "learning_rate": 9.535817148762461e-07,
-      "loss": 0.0157,
-      "step": 2575
-    },
-    {
-      "epoch": 8.724832214765101,
-      "grad_norm": 2.625,
-      "learning_rate": 8.411179883467668e-07,
-      "loss": 0.0115,
-      "step": 2600
-    },
-    {
-      "epoch": 8.808724832214764,
-      "grad_norm": 2.3125,
-      "learning_rate": 7.354182028386591e-07,
-      "loss": 0.0162,
-      "step": 2625
-    },
-    {
-      "epoch": 8.89261744966443,
-      "grad_norm": 2.8125,
-      "learning_rate": 6.365604188743979e-07,
-      "loss": 0.0178,
-      "step": 2650
-    },
-    {
-      "epoch": 8.976510067114093,
-      "grad_norm": 2.4375,
-      "learning_rate": 5.446176440786488e-07,
-      "loss": 0.0139,
-      "step": 2675
-    },
-    {
-      "epoch": 9.0,
-      "eval_loss": 3.2047908306121826,
-      "eval_runtime": 1.8248,
-      "eval_samples_per_second": 69.048,
-      "eval_steps_per_second": 8.768,
-      "step": 2682
     }
   ],
   "logging_steps": 25,
-  "max_steps": 2980,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 10,
   "save_steps": 500,
@@ -842,12 +801,12 @@
         "should_evaluate": false,
         "should_log": false,
         "should_save": true,
-        "should_training_stop": false
       },
       "attributes": {}
     }
   },
-  "total_flos": 1.0370740667256422e+17,
   "train_batch_size": 8,
   "trial_name": null,
   "trial_params": null

 {
+  "best_metric": 1.580127239227295,
+  "best_model_checkpoint": "/home/azureuser/models/grpo/checkpoint-2510",
+  "epoch": 10.0,
   "eval_steps": 500,
+  "global_step": 2510,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
+      "epoch": 0.099601593625498,
+      "grad_norm": 240.0,
+      "learning_rate": 1.99203187250996e-06,
+      "loss": 17.7499,
       "step": 25
     },
     {
+      "epoch": 0.199203187250996,
+      "grad_norm": 162.0,
+      "learning_rate": 3.98406374501992e-06,
+      "loss": 15.9447,
       "step": 50
     },
     {
+      "epoch": 0.29880478087649404,
+      "grad_norm": 68.5,
+      "learning_rate": 5.976095617529881e-06,
+      "loss": 10.3904,
       "step": 75
     },
     {
+      "epoch": 0.398406374501992,
+      "grad_norm": 91.5,
+      "learning_rate": 7.96812749003984e-06,
+      "loss": 6.0541,
       "step": 100
     },
     {
+      "epoch": 0.49800796812749004,
+      "grad_norm": 112.0,
+      "learning_rate": 9.960159362549801e-06,
+      "loss": 3.2563,
       "step": 125
     },
     {
+      "epoch": 0.5976095617529881,
+      "grad_norm": 48.25,
+      "learning_rate": 1.1952191235059762e-05,
+      "loss": 2.3041,
       "step": 150
     },
     {
+      "epoch": 0.6972111553784861,
+      "grad_norm": 15.625,
+      "learning_rate": 1.3944223107569724e-05,
+      "loss": 2.0339,
       "step": 175
     },
     {
+      "epoch": 0.796812749003984,
+      "grad_norm": 12.0625,
+      "learning_rate": 1.593625498007968e-05,
+      "loss": 1.8795,
       "step": 200
     },
     {
+      "epoch": 0.896414342629482,
+      "grad_norm": 9.8125,
+      "learning_rate": 1.7928286852589643e-05,
+      "loss": 1.7841,
       "step": 225
     },
     {
+      "epoch": 0.9960159362549801,
+      "grad_norm": 6.1875,
+      "learning_rate": 1.9920318725099602e-05,
+      "loss": 1.7073,
       "step": 250
     },
     {
+      "epoch": 1.0,
+      "eval_loss": 1.6925737857818604,
+      "eval_runtime": 7.1264,
+      "eval_samples_per_second": 70.443,
+      "eval_steps_per_second": 8.84,
+      "step": 251
     },
     {
+      "epoch": 1.095617529880478,
+      "grad_norm": 5.40625,
+      "learning_rate": 1.9994430458382323e-05,
+      "loss": 1.669,
+      "step": 275
     },
     {
+      "epoch": 1.1952191235059761,
+      "grad_norm": 2.125,
+      "learning_rate": 1.997679073527335e-05,
+      "loss": 1.6468,
       "step": 300
     },
     {
+      "epoch": 1.294820717131474,
+      "grad_norm": 6.5,
+      "learning_rate": 1.9947092480832322e-05,
+      "loss": 1.6281,
       "step": 325
     },
     {
+      "epoch": 1.3944223107569722,
+      "grad_norm": 1.3828125,
+      "learning_rate": 1.9905371590102157e-05,
+      "loss": 1.6223,
       "step": 350
     },
     {
+      "epoch": 1.4940239043824701,
+      "grad_norm": 1.0546875,
+      "learning_rate": 1.98516784893854e-05,
+      "loss": 1.6118,
       "step": 375
     },
     {
+      "epoch": 1.593625498007968,
+      "grad_norm": 3.75,
+      "learning_rate": 1.978607807529606e-05,
+      "loss": 1.6065,
       "step": 400
     },
     {
+      "epoch": 1.6932270916334662,
+      "grad_norm": 4.4375,
+      "learning_rate": 1.9708649636321745e-05,
+      "loss": 1.6031,
       "step": 425
     },
     {
+      "epoch": 1.792828685258964,
+      "grad_norm": 1.703125,
+      "learning_rate": 1.961948675699101e-05,
+      "loss": 1.6012,
       "step": 450
     },
     {
+      "epoch": 1.8924302788844622,
+      "grad_norm": 1.484375,
+      "learning_rate": 1.9518697204761604e-05,
+      "loss": 1.5998,
       "step": 475
     },
     {
+      "epoch": 1.9920318725099602,
+      "grad_norm": 0.953125,
+      "learning_rate": 1.9406402799766452e-05,
+      "loss": 1.5987,
       "step": 500
     },
     {
+      "epoch": 2.0,
+      "eval_loss": 1.596346139907837,
+      "eval_runtime": 7.2817,
+      "eval_samples_per_second": 68.94,
+      "eval_steps_per_second": 8.652,
+      "step": 502
+    },
+    {
+      "epoch": 2.091633466135458,
+      "grad_norm": 0.75390625,
+      "learning_rate": 1.928273926757472e-05,
+      "loss": 1.5935,
       "step": 525
     },
     {
+      "epoch": 2.191235059760956,
+      "grad_norm": 1.1953125,
+      "learning_rate": 1.914785607514599e-05,
+      "loss": 1.5895,
       "step": 550
     },
     {
+      "epoch": 2.2908366533864544,
+      "grad_norm": 0.482421875,
+      "learning_rate": 1.9001916250175764e-05,
+      "loss": 1.5891,
       "step": 575
     },
     {
+      "epoch": 2.3904382470119523,
+      "grad_norm": 1.140625,
+      "learning_rate": 1.8845096184050684e-05,
+      "loss": 1.5868,
       "step": 600
     },
     {
+      "epoch": 2.49003984063745,
+      "grad_norm": 1.7265625,
+      "learning_rate": 1.86775854186516e-05,
+      "loss": 1.587,
       "step": 625
     },
     {
+      "epoch": 2.589641434262948,
+      "grad_norm": 1.0390625,
+      "learning_rate": 1.849958641726221e-05,
+      "loss": 1.5856,
       "step": 650
     },
     {
+      "epoch": 2.6892430278884465,
+      "grad_norm": 2.875,
+      "learning_rate": 1.831131431986012e-05,
+      "loss": 1.5835,
       "step": 675
     },
     {
+      "epoch": 2.7888446215139444,
+      "grad_norm": 1.828125,
+      "learning_rate": 1.8112996683086102e-05,
+      "loss": 1.5826,
       "step": 700
     },
     {
+      "epoch": 2.8884462151394423,
+      "grad_norm": 0.57421875,
+      "learning_rate": 1.7904873205205886e-05,
+      "loss": 1.5831,
       "step": 725
     },
     {
+      "epoch": 2.9880478087649402,
+      "grad_norm": 0.47265625,
+      "learning_rate": 1.7687195436396835e-05,
+      "loss": 1.5826,
       "step": 750
     },
     {
+      "epoch": 3.0,
+      "eval_loss": 1.5846781730651855,
+      "eval_runtime": 7.5283,
+      "eval_samples_per_second": 66.682,
+      "eval_steps_per_second": 8.368,
+      "step": 753
+    },
+    {
+      "epoch": 3.087649402390438,
+      "grad_norm": 0.375,
+      "learning_rate": 1.7460226474709726e-05,
+      "loss": 1.5809,
       "step": 775
     },
     {
+      "epoch": 3.187250996015936,
+      "grad_norm": 1.5703125,
+      "learning_rate": 1.7224240648073097e-05,
+      "loss": 1.5803,
       "step": 800
     },
     {
+      "epoch": 3.2868525896414345,
+      "grad_norm": 0.85546875,
+      "learning_rate": 1.6979523182724514e-05,
+      "loss": 1.5802,
       "step": 825
     },
     {
+      "epoch": 3.3864541832669324,
+      "grad_norm": 0.9765625,
+      "learning_rate": 1.672636985846951e-05,
+      "loss": 1.5789,
       "step": 850
     },
     {
+      "epoch": 3.4860557768924303,
+      "grad_norm": 1.4609375,
+      "learning_rate": 1.6465086651184826e-05,
+      "loss": 1.5788,
       "step": 875
     },
     {
+      "epoch": 3.585657370517928,
+      "grad_norm": 0.60546875,
+      "learning_rate": 1.6195989362998137e-05,
+      "loss": 1.5785,
       "step": 900
     },
     {
+      "epoch": 3.685258964143426,
+      "grad_norm": 1.0078125,
+      "learning_rate": 1.591940324059117e-05,
+      "loss": 1.5784,
       "step": 925
     },
     {
+      "epoch": 3.7848605577689245,
+      "grad_norm": 0.58984375,
+      "learning_rate": 1.5635662582087604e-05,
+      "loss": 1.5779,
       "step": 950
     },
     {
+      "epoch": 3.8844621513944224,
+      "grad_norm": 0.455078125,
+      "learning_rate": 1.534511033300083e-05,
+      "loss": 1.5783,
       "step": 975
     },
     {
+      "epoch": 3.9840637450199203,
+      "grad_norm": 0.3359375,
+      "learning_rate": 1.5048097671730015e-05,
+      "loss": 1.5769,
       "step": 1000
     },
     {
+      "epoch": 4.0,
+      "eval_loss": 1.581375002861023,
+      "eval_runtime": 7.1884,
+      "eval_samples_per_second": 69.834,
+      "eval_steps_per_second": 8.764,
+      "step": 1004
+    },
+    {
+      "epoch": 4.083665338645418,
+      "grad_norm": 0.365234375,
+      "learning_rate": 1.4744983585105388e-05,
+      "loss": 1.576,
       "step": 1025
     },
     {
+      "epoch": 4.183266932270916,
+      "grad_norm": 0.57421875,
+      "learning_rate": 1.4436134434495825e-05,
+      "loss": 1.5763,
       "step": 1050
     },
     {
+      "epoch": 4.282868525896414,
+      "grad_norm": 0.93359375,
+      "learning_rate": 1.412192351300312e-05,
+      "loss": 1.5763,
       "step": 1075
     },
     {
+      "epoch": 4.382470119521912,
+      "grad_norm": 0.466796875,
+      "learning_rate": 1.3802730594278161e-05,
+      "loss": 1.5761,
       "step": 1100
     },
     {
+      "epoch": 4.482071713147411,
+      "grad_norm": 0.5390625,
+      "learning_rate": 1.3478941473504346e-05,
+      "loss": 1.5761,
       "step": 1125
     },
     {
+      "epoch": 4.581673306772909,
+      "grad_norm": 0.35546875,
+      "learning_rate": 1.315094750110301e-05,
+      "loss": 1.5763,
       "step": 1150
     },
     {
+      "epoch": 4.681274900398407,
+      "grad_norm": 0.287109375,
+      "learning_rate": 1.2819145109724476e-05,
+      "loss": 1.5755,
       "step": 1175
     },
     {
+      "epoch": 4.780876494023905,
+      "grad_norm": 0.25390625,
+      "learning_rate": 1.2483935335096434e-05,
+      "loss": 1.5758,
       "step": 1200
     },
     {
+      "epoch": 4.8804780876494025,
+      "grad_norm": 0.298828125,
+      "learning_rate": 1.2145723331308752e-05,
+      "loss": 1.5758,
       "step": 1225
     },
     {
+      "epoch": 4.9800796812749,
+      "grad_norm": 0.49609375,
+      "learning_rate": 1.1804917881120608e-05,
+      "loss": 1.576,
       "step": 1250
     },
     {
+      "epoch": 5.0,
+      "eval_loss": 1.5807639360427856,
+      "eval_runtime": 7.4787,
+      "eval_samples_per_second": 67.124,
+      "eval_steps_per_second": 8.424,
+      "step": 1255
+    },
+    {
+      "epoch": 5.079681274900398,
+      "grad_norm": 0.3671875,
+      "learning_rate": 1.1461930901881812e-05,
+      "loss": 1.5752,
       "step": 1275
     },
     {
+      "epoch": 5.179282868525896,
+      "grad_norm": 0.482421875,
+      "learning_rate": 1.111717694766545e-05,
+      "loss": 1.5749,
       "step": 1300
     },
     {
+      "epoch": 5.278884462151394,
+      "grad_norm": 0.359375,
+      "learning_rate": 1.0771072708213652e-05,
+      "loss": 1.575,
       "step": 1325
     },
     {
+      "epoch": 5.378486055776892,
+      "grad_norm": 0.255859375,
+      "learning_rate": 1.0424036505302062e-05,
+      "loss": 1.5749,
       "step": 1350
     },
     {
+      "epoch": 5.47808764940239,
+      "grad_norm": 0.36328125,
+      "learning_rate": 1.0076487787131726e-05,
+      "loss": 1.5751,
       "step": 1375
     },
     {
+      "epoch": 5.577689243027889,
+      "grad_norm": 0.296875,
+      "learning_rate": 9.728846621359538e-06,
+      "loss": 1.5757,
       "step": 1400
     },
     {
+      "epoch": 5.677290836653387,
+      "grad_norm": 0.375,
+      "learning_rate": 9.381533187379958e-06,
+      "loss": 1.5743,
       "step": 1425
     },
     {
+      "epoch": 5.776892430278885,
+      "grad_norm": 0.2197265625,
+      "learning_rate": 9.034967268471674e-06,
+      "loss": 1.5748,
       "step": 1450
     },
     {
+      "epoch": 5.876494023904383,
+      "grad_norm": 0.421875,
+      "learning_rate": 8.68956774442306e-06,
+      "loss": 1.5753,
       "step": 1475
     },
     {
+      "epoch": 5.9760956175298805,
+      "grad_norm": 0.28515625,
+      "learning_rate": 8.345752085249603e-06,
+      "loss": 1.5741,
+      "step": 1500
     },
     {
+      "epoch": 6.0,
+      "eval_loss": 1.5804367065429688,
+      "eval_runtime": 7.4746,
+      "eval_samples_per_second": 67.161,
+      "eval_steps_per_second": 8.429,
+      "step": 1506
     },
     {
+      "epoch": 6.075697211155378,
+      "grad_norm": 0.6640625,
+      "learning_rate": 8.00393584661531e-06,
+      "loss": 1.5739,
       "step": 1525
     },
     {
+      "epoch": 6.175298804780876,
+      "grad_norm": 0.244140625,
+      "learning_rate": 7.664532167567864e-06,
+      "loss": 1.5734,
       "step": 1550
     },
     {
+      "epoch": 6.274900398406374,
+      "grad_norm": 0.33984375,
+      "learning_rate": 7.327951271194699e-06,
+      "loss": 1.5746,
       "step": 1575
     },
     {
+      "epoch": 6.374501992031872,
+      "grad_norm": 0.2275390625,
+      "learning_rate": 6.994599968803408e-06,
+      "loss": 1.5741,
       "step": 1600
     },
     {
+      "epoch": 6.474103585657371,
+      "grad_norm": 0.36328125,
+      "learning_rate": 6.664881168225894e-06,
+      "loss": 1.5743,
       "step": 1625
     },
     {
+      "epoch": 6.573705179282869,
+      "grad_norm": 0.41796875,
+      "learning_rate": 6.339193386840445e-06,
+      "loss": 1.5739,
       "step": 1650
     },
     {
+      "epoch": 6.673306772908367,
+      "grad_norm": 0.265625,
+      "learning_rate": 6.017930269900377e-06,
+      "loss": 1.5755,
       "step": 1675
     },
     {
+      "epoch": 6.772908366533865,
+      "grad_norm": 0.267578125,
+      "learning_rate": 5.701480114751432e-06,
+      "loss": 1.574,
       "step": 1700
     },
     {
+      "epoch": 6.872509960159363,
+      "grad_norm": 0.26171875,
+      "learning_rate": 5.390225401512923e-06,
+      "loss": 1.5747,
       "step": 1725
     },
     {
+      "epoch": 6.972111553784861,
+      "grad_norm": 0.314453125,
+      "learning_rate": 5.084542330789988e-06,
+      "loss": 1.5752,
       "step": 1750
     },
     {
+      "epoch": 7.0,
+      "eval_loss": 1.5802167654037476,
+      "eval_runtime": 7.2108,
+      "eval_samples_per_second": 69.617,
+      "eval_steps_per_second": 8.737,
+      "step": 1757
     },
     {
+      "epoch": 7.0717131474103585,
+      "grad_norm": 0.392578125,
+      "learning_rate": 4.784800368975557e-06,
+      "loss": 1.5742,
+      "step": 1775
     },
     {
+      "epoch": 7.171314741035856,
+      "grad_norm": 0.302734375,
+      "learning_rate": 4.491361801691717e-06,
+      "loss": 1.5747,
       "step": 1800
     },
     {
+      "epoch": 7.270916334661354,
+      "grad_norm": 0.25,
+      "learning_rate": 4.204581295910207e-06,
+      "loss": 1.5749,
       "step": 1825
     },
     {
+      "epoch": 7.370517928286852,
+      "grad_norm": 0.380859375,
+      "learning_rate": 3.924805471281184e-06,
+      "loss": 1.574,
       "step": 1850
     },
     {
+      "epoch": 7.47011952191235,
+      "grad_norm": 0.2080078125,
+      "learning_rate": 3.652372481188512e-06,
+      "loss": 1.5742,
       "step": 1875
     },
     {
+      "epoch": 7.569721115537849,
+      "grad_norm": 0.23046875,
+      "learning_rate": 3.387611604037848e-06,
+      "loss": 1.574,
       "step": 1900
     },
     {
+      "epoch": 7.669322709163347,
+      "grad_norm": 0.322265625,
+      "learning_rate": 3.1308428452715643e-06,
+      "loss": 1.5747,
       "step": 1925
     },
     {
+      "epoch": 7.768924302788845,
+      "grad_norm": 0.1953125,
+      "learning_rate": 2.8823765505914827e-06,
+      "loss": 1.5743,
       "step": 1950
     },
     {
+      "epoch": 7.868525896414343,
+      "grad_norm": 0.2275390625,
+      "learning_rate": 2.642513030856955e-06,
+      "loss": 1.5742,
       "step": 1975
     },
     {
+      "epoch": 7.968127490039841,
+      "grad_norm": 0.2470703125,
+      "learning_rate": 2.4115421991116605e-06,
+      "loss": 1.574,
       "step": 2000
     },
     {
+      "epoch": 8.0,
+      "eval_loss": 1.5801949501037598,
+      "eval_runtime": 7.4448,
+      "eval_samples_per_second": 67.43,
+      "eval_steps_per_second": 8.462,
+      "step": 2008
+    },
+    {
+      "epoch": 8.06772908366534,
+      "grad_norm": 0.375,
+      "learning_rate": 2.1897432201777614e-06,
+      "loss": 1.5744,
       "step": 2025
     },
     {
+      "epoch": 8.167330677290837,
+      "grad_norm": 0.216796875,
+      "learning_rate": 1.977384173241027e-06,
+      "loss": 1.574,
       "step": 2050
     },
     {
+      "epoch": 8.266932270916335,
+      "grad_norm": 0.21875,
+      "learning_rate": 1.774721727834684e-06,
+      "loss": 1.5749,
       "step": 2075
     },
     {
+      "epoch": 8.366533864541832,
+      "grad_norm": 0.240234375,
+      "learning_rate": 1.5820008336136462e-06,
+      "loss": 1.5743,
       "step": 2100
     },
     {
+      "epoch": 8.466135458167331,
+      "grad_norm": 0.173828125,
+      "learning_rate": 1.3994544242940777e-06,
+      "loss": 1.5736,
       "step": 2125
     },
     {
+      "epoch": 8.565737051792828,
+      "grad_norm": 0.275390625,
+      "learning_rate": 1.2273031361160958e-06,
+      "loss": 1.5739,
       "step": 2150
     },
     {
+      "epoch": 8.665338645418327,
+      "grad_norm": 0.2265625,
+      "learning_rate": 1.0657550411699623e-06,
+      "loss": 1.5744,
       "step": 2175
     },
     {
+      "epoch": 8.764940239043824,
+      "grad_norm": 0.3515625,
+      "learning_rate": 9.150053959080008e-07,
+      "loss": 1.5743,
       "step": 2200
     },
     {
+      "epoch": 8.864541832669323,
+      "grad_norm": 0.322265625,
+      "learning_rate": 7.75236405146258e-07,
+      "loss": 1.5741,
       "step": 2225
     },
     {
+      "epoch": 8.964143426294822,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 6.466170018411422e-07,
+      "loss": 1.5743,
       "step": 2250
     },
     {
+      "epoch": 9.0,
+      "eval_loss": 1.5801681280136108,
+      "eval_runtime": 7.6077,
+      "eval_samples_per_second": 65.986,
+      "eval_steps_per_second": 8.281,
+      "step": 2259
+    },
+    {
+      "epoch": 9.063745019920319,
+      "grad_norm": 0.19921875,
+      "learning_rate": 5.293026429071857e-07,
+      "loss": 1.5739,
       "step": 2275
     },
     {
+      "epoch": 9.163346613545817,
+      "grad_norm": 0.29296875,
+      "learning_rate": 4.2343512132276055e-07,
+      "loss": 1.5742,
       "step": 2300
     },
     {
+      "epoch": 9.262948207171315,
+      "grad_norm": 0.275390625,
+      "learning_rate": 3.2914239475079655e-07,
+      "loss": 1.5749,
       "step": 2325
     },
     {
+      "epoch": 9.362549800796813,
+      "grad_norm": 0.251953125,
+      "learning_rate": 2.4653843088170206e-07,
+      "loss": 1.5744,
       "step": 2350
     },
     {
+      "epoch": 9.46215139442231,
+      "grad_norm": 0.27734375,
+      "learning_rate": 1.757230696853518e-07,
+      "loss": 1.5739,
       "step": 2375
     },
     {
+      "epoch": 9.56175298804781,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 1.1678190273868073e-07,
+      "loss": 1.5741,
       "step": 2400
     },
     {
+      "epoch": 9.661354581673306,
+      "grad_norm": 0.251953125,
+      "learning_rate": 6.978616977470708e-08,
+      "loss": 1.574,
       "step": 2425
     },
     {
+      "epoch": 9.760956175298805,
+      "grad_norm": 0.466796875,
+      "learning_rate": 3.4792672578038974e-08,
+      "loss": 1.5745,
       "step": 2450
     },
     {
+      "epoch": 9.860557768924302,
+      "grad_norm": 0.1923828125,
+      "learning_rate": 1.184370633092291e-08,
+      "loss": 1.5735,
       "step": 2475
     },
     {
+      "epoch": 9.9601593625498,
+      "grad_norm": 0.197265625,
+      "learning_rate": 9.670084928137258e-10,
+      "loss": 1.5736,
       "step": 2500
     },
     {
+      "epoch": 10.0,
+      "eval_loss": 1.580127239227295,
+      "eval_runtime": 9.705,
+      "eval_samples_per_second": 51.726,
+      "eval_steps_per_second": 6.491,
+      "step": 2510
     }
   ],
   "logging_steps": 25,
+  "max_steps": 2510,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 10,
   "save_steps": 500,
         "should_evaluate": false,
         "should_log": false,
         "should_save": true,
+        "should_training_stop": true
       },
       "attributes": {}
     }
   },
+  "total_flos": 9.82463661353902e+16,
   "train_batch_size": 8,
   "trial_name": null,
   "trial_params": null

last-checkpoint/training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:42d76ebcb9158241976952c09114f0be4f135c1e43ad1f55f4156fb783882cac
 size 5368

 version https://git-lfs.github.com/spec/v1
+oid sha256:c99f0fd7499710736e0323507b1752026d9637c5b6b5a6b6798a0a89f4b7d2dd
 size 5368