Training in progress, step 1500, checkpoint

Browse files

Files changed (8) hide show

checkpoint-1500/config.json +40 -0
checkpoint-1500/generation_config.json +12 -0
checkpoint-1500/model.safetensors +3 -0
checkpoint-1500/optimizer.pt +3 -0
checkpoint-1500/rng_state.pth +3 -0
checkpoint-1500/scheduler.pt +3 -0
checkpoint-1500/trainer_state.json +1083 -0
checkpoint-1500/training_args.bin +3 -0

checkpoint-1500/config.json ADDED Viewed

	@@ -0,0 +1,40 @@

+{
+  "_name_or_path": "meta-llama/Llama-3.2-1B-Instruct",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 128000,
+  "eos_token_id": [
+    128001,
+    128008,
+    128009
+  ],
+  "head_dim": 64,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 8192,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 16,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 32.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.46.3",
+  "use_cache": true,
+  "vocab_size": 128256
+}

checkpoint-1500/generation_config.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+  "bos_token_id": 128000,
+  "do_sample": true,
+  "eos_token_id": [
+    128001,
+    128008,
+    128009
+  ],
+  "temperature": 0.6,
+  "top_p": 0.9,
+  "transformers_version": "4.46.3"
+}

checkpoint-1500/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:06bb2c4d5d91910dc814e641e48609741e7009865298f28595da2d30edd96016
+size 2471645608

checkpoint-1500/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:170cb434a03ed2fd43f282a8b8979745fad01db35f937af429e456eb745ff3e5
+size 3724602

checkpoint-1500/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5450255e6a9ab1eb65c5f2d4ad14e937c363f0fe9b8fbbad54746263bc4a8f15
+size 14244

checkpoint-1500/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b62a6982caa9b47207f64a7858dbd39e1fbcf0807c59221faa61c7a640f067d6
+size 1064

checkpoint-1500/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1083 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.09325,
+  "eval_steps": 500,
+  "global_step": 1500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.005,
+      "grad_norm": 402.0,
+      "learning_rate": 1.0000000000000002e-06,
+      "loss": 4.0977,
+      "step": 10
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 51.0,
+      "learning_rate": 2.0000000000000003e-06,
+      "loss": 4.0891,
+      "step": 20
+    },
+    {
+      "epoch": 0.015,
+      "grad_norm": 50.0,
+      "learning_rate": 3e-06,
+      "loss": 3.4992,
+      "step": 30
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 23.5,
+      "learning_rate": 4.000000000000001e-06,
+      "loss": 2.9609,
+      "step": 40
+    },
+    {
+      "epoch": 0.025,
+      "grad_norm": 16.5,
+      "learning_rate": 5e-06,
+      "loss": 3.1281,
+      "step": 50
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 20.375,
+      "learning_rate": 6e-06,
+      "loss": 3.0043,
+      "step": 60
+    },
+    {
+      "epoch": 0.035,
+      "grad_norm": 18.125,
+      "learning_rate": 7e-06,
+      "loss": 2.8566,
+      "step": 70
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 42.0,
+      "learning_rate": 8.000000000000001e-06,
+      "loss": 3.1531,
+      "step": 80
+    },
+    {
+      "epoch": 0.045,
+      "grad_norm": 17.25,
+      "learning_rate": 9e-06,
+      "loss": 2.6734,
+      "step": 90
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 13.6875,
+      "learning_rate": 1e-05,
+      "loss": 2.6406,
+      "step": 100
+    },
+    {
+      "epoch": 0.055,
+      "grad_norm": 11.875,
+      "learning_rate": 1.1000000000000001e-05,
+      "loss": 2.7305,
+      "step": 110
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 15.5,
+      "learning_rate": 1.2e-05,
+      "loss": 2.6008,
+      "step": 120
+    },
+    {
+      "epoch": 0.065,
+      "grad_norm": 12.9375,
+      "learning_rate": 1.3000000000000001e-05,
+      "loss": 2.5461,
+      "step": 130
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 14.4375,
+      "learning_rate": 1.4e-05,
+      "loss": 2.6531,
+      "step": 140
+    },
+    {
+      "epoch": 0.075,
+      "grad_norm": 12.125,
+      "learning_rate": 1.5000000000000002e-05,
+      "loss": 2.3348,
+      "step": 150
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 13.25,
+      "learning_rate": 1.6000000000000003e-05,
+      "loss": 2.268,
+      "step": 160
+    },
+    {
+      "epoch": 0.085,
+      "grad_norm": 13.4375,
+      "learning_rate": 1.7e-05,
+      "loss": 2.3805,
+      "step": 170
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 18.125,
+      "learning_rate": 1.8e-05,
+      "loss": 2.2859,
+      "step": 180
+    },
+    {
+      "epoch": 0.095,
+      "grad_norm": 14.75,
+      "learning_rate": 1.9e-05,
+      "loss": 2.2238,
+      "step": 190
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 11.6875,
+      "learning_rate": 2e-05,
+      "loss": 2.2816,
+      "step": 200
+    },
+    {
+      "epoch": 0.105,
+      "grad_norm": 18.875,
+      "learning_rate": 1.9998476951563914e-05,
+      "loss": 1.8926,
+      "step": 210
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 10.8125,
+      "learning_rate": 1.999390827019096e-05,
+      "loss": 2.0168,
+      "step": 220
+    },
+    {
+      "epoch": 0.115,
+      "grad_norm": 30.0,
+      "learning_rate": 1.9986295347545738e-05,
+      "loss": 1.9996,
+      "step": 230
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 14.6875,
+      "learning_rate": 1.9975640502598243e-05,
+      "loss": 1.8109,
+      "step": 240
+    },
+    {
+      "epoch": 0.125,
+      "grad_norm": 14.0625,
+      "learning_rate": 1.9961946980917457e-05,
+      "loss": 1.7777,
+      "step": 250
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 22.125,
+      "learning_rate": 1.9945218953682736e-05,
+      "loss": 1.7828,
+      "step": 260
+    },
+    {
+      "epoch": 0.135,
+      "grad_norm": 28.75,
+      "learning_rate": 1.9925461516413224e-05,
+      "loss": 1.5926,
+      "step": 270
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 17.375,
+      "learning_rate": 1.9902680687415704e-05,
+      "loss": 1.7307,
+      "step": 280
+    },
+    {
+      "epoch": 0.145,
+      "grad_norm": 14.4375,
+      "learning_rate": 1.9876883405951378e-05,
+      "loss": 1.6797,
+      "step": 290
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 45.0,
+      "learning_rate": 1.9848077530122083e-05,
+      "loss": 1.4248,
+      "step": 300
+    },
+    {
+      "epoch": 0.155,
+      "grad_norm": 12.9375,
+      "learning_rate": 1.9816271834476642e-05,
+      "loss": 1.5828,
+      "step": 310
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 16.25,
+      "learning_rate": 1.9781476007338058e-05,
+      "loss": 1.3252,
+      "step": 320
+    },
+    {
+      "epoch": 0.165,
+      "grad_norm": 15.9375,
+      "learning_rate": 1.9743700647852356e-05,
+      "loss": 1.3516,
+      "step": 330
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 38.75,
+      "learning_rate": 1.9702957262759964e-05,
+      "loss": 1.4135,
+      "step": 340
+    },
+    {
+      "epoch": 0.175,
+      "grad_norm": 64.0,
+      "learning_rate": 1.9659258262890683e-05,
+      "loss": 1.3307,
+      "step": 350
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 109.0,
+      "learning_rate": 1.961261695938319e-05,
+      "loss": 1.2195,
+      "step": 360
+    },
+    {
+      "epoch": 0.185,
+      "grad_norm": 11.3125,
+      "learning_rate": 1.9563047559630356e-05,
+      "loss": 1.2945,
+      "step": 370
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 11.0625,
+      "learning_rate": 1.9510565162951538e-05,
+      "loss": 1.1441,
+      "step": 380
+    },
+    {
+      "epoch": 0.195,
+      "grad_norm": 36.0,
+      "learning_rate": 1.945518575599317e-05,
+      "loss": 1.3148,
+      "step": 390
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 31.0,
+      "learning_rate": 1.9396926207859085e-05,
+      "loss": 1.2035,
+      "step": 400
+    },
+    {
+      "epoch": 0.205,
+      "grad_norm": 81.0,
+      "learning_rate": 1.9335804264972018e-05,
+      "loss": 1.1465,
+      "step": 410
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 77.0,
+      "learning_rate": 1.9271838545667876e-05,
+      "loss": 1.1883,
+      "step": 420
+    },
+    {
+      "epoch": 0.215,
+      "grad_norm": 64.0,
+      "learning_rate": 1.9205048534524405e-05,
+      "loss": 1.1635,
+      "step": 430
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 12.3125,
+      "learning_rate": 1.913545457642601e-05,
+      "loss": 0.8765,
+      "step": 440
+    },
+    {
+      "epoch": 0.225,
+      "grad_norm": 15.8125,
+      "learning_rate": 1.9063077870366504e-05,
+      "loss": 1.2404,
+      "step": 450
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 9.3125,
+      "learning_rate": 1.8987940462991673e-05,
+      "loss": 1.082,
+      "step": 460
+    },
+    {
+      "epoch": 0.235,
+      "grad_norm": 111.0,
+      "learning_rate": 1.891006524188368e-05,
+      "loss": 0.749,
+      "step": 470
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 49.75,
+      "learning_rate": 1.8829475928589272e-05,
+      "loss": 1.0076,
+      "step": 480
+    },
+    {
+      "epoch": 0.245,
+      "grad_norm": 16.625,
+      "learning_rate": 1.874619707139396e-05,
+      "loss": 0.9004,
+      "step": 490
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 34.75,
+      "learning_rate": 1.866025403784439e-05,
+      "loss": 0.7954,
+      "step": 500
+    },
+    {
+      "epoch": 0.255,
+      "grad_norm": 80.0,
+      "learning_rate": 1.8571673007021124e-05,
+      "loss": 0.9604,
+      "step": 510
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 68.5,
+      "learning_rate": 1.848048096156426e-05,
+      "loss": 0.6887,
+      "step": 520
+    },
+    {
+      "epoch": 0.265,
+      "grad_norm": 10.625,
+      "learning_rate": 1.8386705679454243e-05,
+      "loss": 0.7305,
+      "step": 530
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 7.59375,
+      "learning_rate": 1.8290375725550417e-05,
+      "loss": 0.6724,
+      "step": 540
+    },
+    {
+      "epoch": 0.275,
+      "grad_norm": 9.0625,
+      "learning_rate": 1.819152044288992e-05,
+      "loss": 0.7522,
+      "step": 550
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 15.8125,
+      "learning_rate": 1.8090169943749477e-05,
+      "loss": 0.6078,
+      "step": 560
+    },
+    {
+      "epoch": 0.285,
+      "grad_norm": 20.625,
+      "learning_rate": 1.798635510047293e-05,
+      "loss": 0.6214,
+      "step": 570
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 46.5,
+      "learning_rate": 1.788010753606722e-05,
+      "loss": 0.6955,
+      "step": 580
+    },
+    {
+      "epoch": 0.295,
+      "grad_norm": 6.96875,
+      "learning_rate": 1.777145961456971e-05,
+      "loss": 0.5002,
+      "step": 590
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 9.875,
+      "learning_rate": 1.766044443118978e-05,
+      "loss": 0.7948,
+      "step": 600
+    },
+    {
+      "epoch": 0.305,
+      "grad_norm": 6.125,
+      "learning_rate": 1.7547095802227723e-05,
+      "loss": 0.4094,
+      "step": 610
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 5.0625,
+      "learning_rate": 1.7431448254773943e-05,
+      "loss": 0.4871,
+      "step": 620
+    },
+    {
+      "epoch": 0.315,
+      "grad_norm": 6.40625,
+      "learning_rate": 1.7313537016191706e-05,
+      "loss": 0.6022,
+      "step": 630
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 5.59375,
+      "learning_rate": 1.7193398003386514e-05,
+      "loss": 0.4998,
+      "step": 640
+    },
+    {
+      "epoch": 0.325,
+      "grad_norm": 5.75,
+      "learning_rate": 1.7071067811865477e-05,
+      "loss": 0.4213,
+      "step": 650
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 16.0,
+      "learning_rate": 1.6946583704589973e-05,
+      "loss": 0.5501,
+      "step": 660
+    },
+    {
+      "epoch": 0.335,
+      "grad_norm": 20.875,
+      "learning_rate": 1.6819983600624986e-05,
+      "loss": 0.4,
+      "step": 670
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 8.0,
+      "learning_rate": 1.6691306063588583e-05,
+      "loss": 0.3687,
+      "step": 680
+    },
+    {
+      "epoch": 0.345,
+      "grad_norm": 4.375,
+      "learning_rate": 1.6560590289905074e-05,
+      "loss": 0.4078,
+      "step": 690
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 4.78125,
+      "learning_rate": 1.6427876096865394e-05,
+      "loss": 0.3165,
+      "step": 700
+    },
+    {
+      "epoch": 0.355,
+      "grad_norm": 32.25,
+      "learning_rate": 1.6293203910498375e-05,
+      "loss": 0.3724,
+      "step": 710
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 8.125,
+      "learning_rate": 1.6156614753256583e-05,
+      "loss": 0.3892,
+      "step": 720
+    },
+    {
+      "epoch": 0.365,
+      "grad_norm": 15.0,
+      "learning_rate": 1.6018150231520486e-05,
+      "loss": 0.351,
+      "step": 730
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 5.46875,
+      "learning_rate": 1.5877852522924733e-05,
+      "loss": 0.4384,
+      "step": 740
+    },
+    {
+      "epoch": 0.375,
+      "grad_norm": 13.125,
+      "learning_rate": 1.573576436351046e-05,
+      "loss": 0.346,
+      "step": 750
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 3.890625,
+      "learning_rate": 1.5591929034707468e-05,
+      "loss": 0.303,
+      "step": 760
+    },
+    {
+      "epoch": 0.385,
+      "grad_norm": 15.375,
+      "learning_rate": 1.5446390350150272e-05,
+      "loss": 0.3484,
+      "step": 770
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 6.1875,
+      "learning_rate": 1.529919264233205e-05,
+      "loss": 0.286,
+      "step": 780
+    },
+    {
+      "epoch": 0.395,
+      "grad_norm": 3.96875,
+      "learning_rate": 1.5150380749100545e-05,
+      "loss": 0.2787,
+      "step": 790
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 4.5625,
+      "learning_rate": 1.5000000000000002e-05,
+      "loss": 0.2914,
+      "step": 800
+    },
+    {
+      "epoch": 0.405,
+      "grad_norm": 16.125,
+      "learning_rate": 1.4848096202463373e-05,
+      "loss": 0.2648,
+      "step": 810
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 6.0,
+      "learning_rate": 1.469471562785891e-05,
+      "loss": 0.2437,
+      "step": 820
+    },
+    {
+      "epoch": 0.415,
+      "grad_norm": 5.71875,
+      "learning_rate": 1.4539904997395468e-05,
+      "loss": 0.2669,
+      "step": 830
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 3.5625,
+      "learning_rate": 1.4383711467890776e-05,
+      "loss": 0.2553,
+      "step": 840
+    },
+    {
+      "epoch": 0.425,
+      "grad_norm": 3.8125,
+      "learning_rate": 1.4226182617406996e-05,
+      "loss": 0.2304,
+      "step": 850
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 6.0625,
+      "learning_rate": 1.4067366430758004e-05,
+      "loss": 0.2303,
+      "step": 860
+    },
+    {
+      "epoch": 0.435,
+      "grad_norm": 6.0,
+      "learning_rate": 1.3907311284892737e-05,
+      "loss": 0.2441,
+      "step": 870
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 5.71875,
+      "learning_rate": 1.3746065934159123e-05,
+      "loss": 0.24,
+      "step": 880
+    },
+    {
+      "epoch": 0.445,
+      "grad_norm": 3.546875,
+      "learning_rate": 1.3583679495453e-05,
+      "loss": 0.2336,
+      "step": 890
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 3.140625,
+      "learning_rate": 1.342020143325669e-05,
+      "loss": 0.2136,
+      "step": 900
+    },
+    {
+      "epoch": 0.455,
+      "grad_norm": 3.71875,
+      "learning_rate": 1.3255681544571568e-05,
+      "loss": 0.2241,
+      "step": 910
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 3.515625,
+      "learning_rate": 1.3090169943749475e-05,
+      "loss": 0.2322,
+      "step": 920
+    },
+    {
+      "epoch": 0.465,
+      "grad_norm": 4.0625,
+      "learning_rate": 1.2923717047227368e-05,
+      "loss": 0.2061,
+      "step": 930
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 3.8125,
+      "learning_rate": 1.2756373558169992e-05,
+      "loss": 0.2297,
+      "step": 940
+    },
+    {
+      "epoch": 0.475,
+      "grad_norm": 3.296875,
+      "learning_rate": 1.2588190451025209e-05,
+      "loss": 0.2035,
+      "step": 950
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 3.546875,
+      "learning_rate": 1.2419218955996677e-05,
+      "loss": 0.2047,
+      "step": 960
+    },
+    {
+      "epoch": 0.485,
+      "grad_norm": 3.203125,
+      "learning_rate": 1.2249510543438652e-05,
+      "loss": 0.1935,
+      "step": 970
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 3.28125,
+      "learning_rate": 1.2079116908177592e-05,
+      "loss": 0.2045,
+      "step": 980
+    },
+    {
+      "epoch": 0.495,
+      "grad_norm": 2.984375,
+      "learning_rate": 1.190808995376545e-05,
+      "loss": 0.1985,
+      "step": 990
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 2.9375,
+      "learning_rate": 1.1736481776669307e-05,
+      "loss": 0.1922,
+      "step": 1000
+    },
+    {
+      "epoch": 0.505,
+      "grad_norm": 3.140625,
+      "learning_rate": 1.156434465040231e-05,
+      "loss": 0.1796,
+      "step": 1010
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 3.46875,
+      "learning_rate": 1.1391731009600655e-05,
+      "loss": 0.1798,
+      "step": 1020
+    },
+    {
+      "epoch": 0.515,
+      "grad_norm": 2.984375,
+      "learning_rate": 1.1218693434051475e-05,
+      "loss": 0.173,
+      "step": 1030
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 3.171875,
+      "learning_rate": 1.1045284632676535e-05,
+      "loss": 0.1625,
+      "step": 1040
+    },
+    {
+      "epoch": 0.525,
+      "grad_norm": 3.25,
+      "learning_rate": 1.0871557427476585e-05,
+      "loss": 0.1725,
+      "step": 1050
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 2.421875,
+      "learning_rate": 1.0697564737441254e-05,
+      "loss": 0.1833,
+      "step": 1060
+    },
+    {
+      "epoch": 0.535,
+      "grad_norm": 3.078125,
+      "learning_rate": 1.0523359562429441e-05,
+      "loss": 0.1812,
+      "step": 1070
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 2.765625,
+      "learning_rate": 1.0348994967025012e-05,
+      "loss": 0.1681,
+      "step": 1080
+    },
+    {
+      "epoch": 0.545,
+      "grad_norm": 3.046875,
+      "learning_rate": 1.0174524064372837e-05,
+      "loss": 0.176,
+      "step": 1090
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 3.296875,
+      "learning_rate": 1e-05,
+      "loss": 0.177,
+      "step": 1100
+    },
+    {
+      "epoch": 0.555,
+      "grad_norm": 3.703125,
+      "learning_rate": 9.825475935627165e-06,
+      "loss": 0.1714,
+      "step": 1110
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 2.9375,
+      "learning_rate": 9.651005032974994e-06,
+      "loss": 0.1514,
+      "step": 1120
+    },
+    {
+      "epoch": 0.565,
+      "grad_norm": 2.5,
+      "learning_rate": 9.476640437570562e-06,
+      "loss": 0.1479,
+      "step": 1130
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 3.28125,
+      "learning_rate": 9.302435262558748e-06,
+      "loss": 0.1638,
+      "step": 1140
+    },
+    {
+      "epoch": 0.575,
+      "grad_norm": 2.734375,
+      "learning_rate": 9.128442572523418e-06,
+      "loss": 0.1538,
+      "step": 1150
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 2.65625,
+      "learning_rate": 8.954715367323468e-06,
+      "loss": 0.1554,
+      "step": 1160
+    },
+    {
+      "epoch": 0.585,
+      "grad_norm": 2.234375,
+      "learning_rate": 8.781306565948528e-06,
+      "loss": 0.1489,
+      "step": 1170
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 2.53125,
+      "learning_rate": 8.60826899039935e-06,
+      "loss": 0.1451,
+      "step": 1180
+    },
+    {
+      "epoch": 0.595,
+      "grad_norm": 3.65625,
+      "learning_rate": 8.43565534959769e-06,
+      "loss": 0.1394,
+      "step": 1190
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 2.859375,
+      "learning_rate": 8.263518223330698e-06,
+      "loss": 0.1447,
+      "step": 1200
+    },
+    {
+      "epoch": 0.605,
+      "grad_norm": 2.5,
+      "learning_rate": 8.091910046234552e-06,
+      "loss": 0.1331,
+      "step": 1210
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 1.78125,
+      "learning_rate": 7.92088309182241e-06,
+      "loss": 0.1393,
+      "step": 1220
+    },
+    {
+      "epoch": 0.615,
+      "grad_norm": 1.734375,
+      "learning_rate": 7.750489456561351e-06,
+      "loss": 0.1366,
+      "step": 1230
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 2.234375,
+      "learning_rate": 7.580781044003324e-06,
+      "loss": 0.1397,
+      "step": 1240
+    },
+    {
+      "epoch": 0.625,
+      "grad_norm": 2.8125,
+      "learning_rate": 7.411809548974792e-06,
+      "loss": 0.144,
+      "step": 1250
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 1.75,
+      "learning_rate": 7.243626441830009e-06,
+      "loss": 0.1424,
+      "step": 1260
+    },
+    {
+      "epoch": 0.635,
+      "grad_norm": 2.34375,
+      "learning_rate": 7.076282952772634e-06,
+      "loss": 0.1373,
+      "step": 1270
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.5703125,
+      "learning_rate": 6.909830056250527e-06,
+      "loss": 0.141,
+      "step": 1280
+    },
+    {
+      "epoch": 0.645,
+      "grad_norm": 2.078125,
+      "learning_rate": 6.744318455428436e-06,
+      "loss": 0.1408,
+      "step": 1290
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 1.8671875,
+      "learning_rate": 6.579798566743314e-06,
+      "loss": 0.1423,
+      "step": 1300
+    },
+    {
+      "epoch": 0.655,
+      "grad_norm": 2.90625,
+      "learning_rate": 6.4163205045469975e-06,
+      "loss": 0.1425,
+      "step": 1310
+    },
+    {
+      "epoch": 1.00325,
+      "grad_norm": 2.078125,
+      "learning_rate": 6.25393406584088e-06,
+      "loss": 0.1304,
+      "step": 1320
+    },
+    {
+      "epoch": 1.00825,
+      "grad_norm": 1.453125,
+      "learning_rate": 6.092688715107265e-06,
+      "loss": 0.1265,
+      "step": 1330
+    },
+    {
+      "epoch": 1.01325,
+      "grad_norm": 2.8125,
+      "learning_rate": 5.932633569242e-06,
+      "loss": 0.1298,
+      "step": 1340
+    },
+    {
+      "epoch": 1.01825,
+      "grad_norm": 2.5625,
+      "learning_rate": 5.773817382593008e-06,
+      "loss": 0.1219,
+      "step": 1350
+    },
+    {
+      "epoch": 1.02325,
+      "grad_norm": 2.890625,
+      "learning_rate": 5.616288532109225e-06,
+      "loss": 0.1362,
+      "step": 1360
+    },
+    {
+      "epoch": 1.02825,
+      "grad_norm": 2.40625,
+      "learning_rate": 5.460095002604533e-06,
+      "loss": 0.1371,
+      "step": 1370
+    },
+    {
+      "epoch": 1.03325,
+      "grad_norm": 1.578125,
+      "learning_rate": 5.305284372141095e-06,
+      "loss": 0.132,
+      "step": 1380
+    },
+    {
+      "epoch": 1.03825,
+      "grad_norm": 2.421875,
+      "learning_rate": 5.151903797536631e-06,
+      "loss": 0.1396,
+      "step": 1390
+    },
+    {
+      "epoch": 1.04325,
+      "grad_norm": 1.9453125,
+      "learning_rate": 5.000000000000003e-06,
+      "loss": 0.1356,
+      "step": 1400
+    },
+    {
+      "epoch": 1.04825,
+      "grad_norm": 1.8359375,
+      "learning_rate": 4.849619250899458e-06,
+      "loss": 0.1284,
+      "step": 1410
+    },
+    {
+      "epoch": 1.05325,
+      "grad_norm": 2.03125,
+      "learning_rate": 4.700807357667953e-06,
+      "loss": 0.1302,
+      "step": 1420
+    },
+    {
+      "epoch": 1.05825,
+      "grad_norm": 1.6875,
+      "learning_rate": 4.5536096498497295e-06,
+      "loss": 0.1325,
+      "step": 1430
+    },
+    {
+      "epoch": 1.06325,
+      "grad_norm": 2.3125,
+      "learning_rate": 4.408070965292534e-06,
+      "loss": 0.1279,
+      "step": 1440
+    },
+    {
+      "epoch": 1.06825,
+      "grad_norm": 2.140625,
+      "learning_rate": 4.264235636489542e-06,
+      "loss": 0.1268,
+      "step": 1450
+    },
+    {
+      "epoch": 1.07325,
+      "grad_norm": 2.59375,
+      "learning_rate": 4.12214747707527e-06,
+      "loss": 0.1296,
+      "step": 1460
+    },
+    {
+      "epoch": 1.07825,
+      "grad_norm": 2.484375,
+      "learning_rate": 3.981849768479516e-06,
+      "loss": 0.1263,
+      "step": 1470
+    },
+    {
+      "epoch": 1.08325,
+      "grad_norm": 2.703125,
+      "learning_rate": 3.8433852467434175e-06,
+      "loss": 0.1232,
+      "step": 1480
+    },
+    {
+      "epoch": 1.08825,
+      "grad_norm": 1.6640625,
+      "learning_rate": 3.7067960895016277e-06,
+      "loss": 0.1212,
+      "step": 1490
+    },
+    {
+      "epoch": 1.09325,
+      "grad_norm": 1.8046875,
+      "learning_rate": 3.5721239031346067e-06,
+      "loss": 0.1177,
+      "step": 1500
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 2000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 9223372036854775807,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.4346932754736742e+17,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-1500/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e5e32038c45d54a6607b9dae189c2a754eb22b9c228b351719f8fba20e106598
+size 5368