Checkpoint at step 1000 (FP16)

Browse files

Files changed (14) hide show

checkpoint-1000/config.json +38 -0
checkpoint-1000/generation_config.json +4 -0
checkpoint-1000/merges.txt +0 -0
checkpoint-1000/model.safetensors +3 -0
checkpoint-1000/optimizer.pt +3 -0
checkpoint-1000/rng_state.pth +3 -0
checkpoint-1000/scaler.pt +3 -0
checkpoint-1000/scheduler.pt +3 -0
checkpoint-1000/special_tokens_map.json +30 -0
checkpoint-1000/tokenizer.json +0 -0
checkpoint-1000/tokenizer_config.json +23 -0
checkpoint-1000/trainer_state.json +749 -0
checkpoint-1000/training_args.bin +3 -0
checkpoint-1000/vocab.json +0 -0

checkpoint-1000/config.json ADDED Viewed

	@@ -0,0 +1,38 @@

+{
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 224,
+  "initializer_range": 0.02,
+  "intermediate_size": 896,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 32768,
+  "max_window_layers": 28,
+  "model_type": "qwen3",
+  "num_attention_heads": 8,
+  "num_hidden_layers": 8,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float16",
+  "transformers_version": "4.53.2",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 50257
+}

checkpoint-1000/generation_config.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "_from_model_config": true,
+  "transformers_version": "4.53.2"
+}

checkpoint-1000/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-1000/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:67eb6d181f582ee4d85fcb514ab6f3f159b3029479e5a1f80404ce8c11e6136d
+size 138721680

checkpoint-1000/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:45ba53398aa2480a679fa2b7e587dcdfc0bb347eeb9e7a280f1989c460ea03d5
+size 277500235

checkpoint-1000/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645

checkpoint-1000/scaler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9aa7e5f7f3366b7db6f25a9e3fa739116674bd641ce589a5940ff73f382fec0c
+size 1383

checkpoint-1000/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1dc8e131ce3fea54b5bbb10599e4aa96737710b736d49fec16ee3b6ee02cacfb
+size 1465

checkpoint-1000/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "bos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-1000/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-1000/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "50256": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 1024,
+  "pad_token": "<|endoftext|>",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>"
+}

checkpoint-1000/trainer_state.json ADDED Viewed

	@@ -0,0 +1,749 @@

+{
+  "best_global_step": 1000,
+  "best_metric": 4.4797258377075195,
+  "best_model_checkpoint": "./qwen3-30m-tinystories-checkpoints/checkpoint-1000",
+  "epoch": 0.05850690381465013,
+  "eval_steps": 1000,
+  "global_step": 1000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 5.850690381465013e-05,
+      "grad_norm": Infinity,
+      "learning_rate": 0.0,
+      "loss": 11.3909,
+      "step": 1
+    },
+    {
+      "epoch": 0.0005850690381465013,
+      "grad_norm": 74.58138275146484,
+      "learning_rate": 6.825273010920438e-08,
+      "loss": 11.4747,
+      "step": 10
+    },
+    {
+      "epoch": 0.0011701380762930026,
+      "grad_norm": 70.98603057861328,
+      "learning_rate": 1.6575663026521062e-07,
+      "loss": 11.4193,
+      "step": 20
+    },
+    {
+      "epoch": 0.0017552071144395038,
+      "grad_norm": 67.07854461669922,
+      "learning_rate": 2.632605304212169e-07,
+      "loss": 11.3702,
+      "step": 30
+    },
+    {
+      "epoch": 0.002340276152586005,
+      "grad_norm": 65.53945922851562,
+      "learning_rate": 3.6076443057722313e-07,
+      "loss": 11.1989,
+      "step": 40
+    },
+    {
+      "epoch": 0.0029253451907325064,
+      "grad_norm": 56.668155670166016,
+      "learning_rate": 4.582683307332293e-07,
+      "loss": 10.894,
+      "step": 50
+    },
+    {
+      "epoch": 0.0035104142288790077,
+      "grad_norm": 47.96501159667969,
+      "learning_rate": 5.557722308892356e-07,
+      "loss": 10.6112,
+      "step": 60
+    },
+    {
+      "epoch": 0.004095483267025509,
+      "grad_norm": 36.578582763671875,
+      "learning_rate": 6.532761310452418e-07,
+      "loss": 10.2903,
+      "step": 70
+    },
+    {
+      "epoch": 0.00468055230517201,
+      "grad_norm": 30.915164947509766,
+      "learning_rate": 7.507800312012481e-07,
+      "loss": 9.9734,
+      "step": 80
+    },
+    {
+      "epoch": 0.0052656213433185115,
+      "grad_norm": 24.249662399291992,
+      "learning_rate": 8.482839313572544e-07,
+      "loss": 9.7175,
+      "step": 90
+    },
+    {
+      "epoch": 0.005850690381465013,
+      "grad_norm": 17.660844802856445,
+      "learning_rate": 9.457878315132606e-07,
+      "loss": 9.4751,
+      "step": 100
+    },
+    {
+      "epoch": 0.006435759419611514,
+      "grad_norm": 14.554573059082031,
+      "learning_rate": 1.043291731669267e-06,
+      "loss": 9.2603,
+      "step": 110
+    },
+    {
+      "epoch": 0.007020828457758015,
+      "grad_norm": 12.098219871520996,
+      "learning_rate": 1.140795631825273e-06,
+      "loss": 9.0319,
+      "step": 120
+    },
+    {
+      "epoch": 0.007605897495904517,
+      "grad_norm": 10.675163269042969,
+      "learning_rate": 1.2382995319812794e-06,
+      "loss": 8.8657,
+      "step": 130
+    },
+    {
+      "epoch": 0.008190966534051018,
+      "grad_norm": 9.221511840820312,
+      "learning_rate": 1.3358034321372856e-06,
+      "loss": 8.6798,
+      "step": 140
+    },
+    {
+      "epoch": 0.00877603557219752,
+      "grad_norm": 8.275217056274414,
+      "learning_rate": 1.4333073322932917e-06,
+      "loss": 8.5072,
+      "step": 150
+    },
+    {
+      "epoch": 0.00936110461034402,
+      "grad_norm": 7.335099220275879,
+      "learning_rate": 1.5308112324492981e-06,
+      "loss": 8.3525,
+      "step": 160
+    },
+    {
+      "epoch": 0.009946173648490523,
+      "grad_norm": 6.6783833503723145,
+      "learning_rate": 1.6283151326053041e-06,
+      "loss": 8.2143,
+      "step": 170
+    },
+    {
+      "epoch": 0.010531242686637023,
+      "grad_norm": 5.890490531921387,
+      "learning_rate": 1.7258190327613103e-06,
+      "loss": 8.1013,
+      "step": 180
+    },
+    {
+      "epoch": 0.011116311724783525,
+      "grad_norm": 5.453526496887207,
+      "learning_rate": 1.823322932917317e-06,
+      "loss": 7.9714,
+      "step": 190
+    },
+    {
+      "epoch": 0.011701380762930026,
+      "grad_norm": 4.6703081130981445,
+      "learning_rate": 1.920826833073323e-06,
+      "loss": 7.8631,
+      "step": 200
+    },
+    {
+      "epoch": 0.012286449801076528,
+      "grad_norm": 4.414166450500488,
+      "learning_rate": 2.0183307332293293e-06,
+      "loss": 7.7655,
+      "step": 210
+    },
+    {
+      "epoch": 0.012871518839223028,
+      "grad_norm": 4.081826686859131,
+      "learning_rate": 2.1158346333853355e-06,
+      "loss": 7.6717,
+      "step": 220
+    },
+    {
+      "epoch": 0.01345658787736953,
+      "grad_norm": 4.213140487670898,
+      "learning_rate": 2.2133385335413417e-06,
+      "loss": 7.5924,
+      "step": 230
+    },
+    {
+      "epoch": 0.01404165691551603,
+      "grad_norm": 3.859170436859131,
+      "learning_rate": 2.310842433697348e-06,
+      "loss": 7.511,
+      "step": 240
+    },
+    {
+      "epoch": 0.014626725953662533,
+      "grad_norm": 3.9012818336486816,
+      "learning_rate": 2.408346333853354e-06,
+      "loss": 7.4278,
+      "step": 250
+    },
+    {
+      "epoch": 0.015211794991809033,
+      "grad_norm": 3.8075311183929443,
+      "learning_rate": 2.5058502340093603e-06,
+      "loss": 7.3492,
+      "step": 260
+    },
+    {
+      "epoch": 0.015796864029955535,
+      "grad_norm": 3.5343337059020996,
+      "learning_rate": 2.603354134165367e-06,
+      "loss": 7.2719,
+      "step": 270
+    },
+    {
+      "epoch": 0.016381933068102036,
+      "grad_norm": 3.5580379962921143,
+      "learning_rate": 2.700858034321373e-06,
+      "loss": 7.2373,
+      "step": 280
+    },
+    {
+      "epoch": 0.016967002106248536,
+      "grad_norm": 3.4421544075012207,
+      "learning_rate": 2.7983619344773792e-06,
+      "loss": 7.1697,
+      "step": 290
+    },
+    {
+      "epoch": 0.01755207114439504,
+      "grad_norm": 3.7167348861694336,
+      "learning_rate": 2.8958658346333854e-06,
+      "loss": 7.0948,
+      "step": 300
+    },
+    {
+      "epoch": 0.01813714018254154,
+      "grad_norm": 4.353510856628418,
+      "learning_rate": 2.993369734789392e-06,
+      "loss": 7.0373,
+      "step": 310
+    },
+    {
+      "epoch": 0.01872220922068804,
+      "grad_norm": 4.191625118255615,
+      "learning_rate": 3.0908736349453982e-06,
+      "loss": 6.9602,
+      "step": 320
+    },
+    {
+      "epoch": 0.01930727825883454,
+      "grad_norm": 4.205361843109131,
+      "learning_rate": 3.188377535101404e-06,
+      "loss": 6.903,
+      "step": 330
+    },
+    {
+      "epoch": 0.019892347296981045,
+      "grad_norm": 4.100222110748291,
+      "learning_rate": 3.28588143525741e-06,
+      "loss": 6.8439,
+      "step": 340
+    },
+    {
+      "epoch": 0.020477416335127546,
+      "grad_norm": 4.314079284667969,
+      "learning_rate": 3.3833853354134164e-06,
+      "loss": 6.7787,
+      "step": 350
+    },
+    {
+      "epoch": 0.021062485373274046,
+      "grad_norm": 6.652833938598633,
+      "learning_rate": 3.4808892355694226e-06,
+      "loss": 6.7226,
+      "step": 360
+    },
+    {
+      "epoch": 0.021647554411420546,
+      "grad_norm": 4.056787014007568,
+      "learning_rate": 3.5783931357254296e-06,
+      "loss": 6.664,
+      "step": 370
+    },
+    {
+      "epoch": 0.02223262344956705,
+      "grad_norm": 4.815715312957764,
+      "learning_rate": 3.675897035881436e-06,
+      "loss": 6.6125,
+      "step": 380
+    },
+    {
+      "epoch": 0.02281769248771355,
+      "grad_norm": 4.876301288604736,
+      "learning_rate": 3.773400936037442e-06,
+      "loss": 6.5621,
+      "step": 390
+    },
+    {
+      "epoch": 0.02340276152586005,
+      "grad_norm": 4.692805290222168,
+      "learning_rate": 3.870904836193448e-06,
+      "loss": 6.5075,
+      "step": 400
+    },
+    {
+      "epoch": 0.02398783056400655,
+      "grad_norm": 4.293951034545898,
+      "learning_rate": 3.968408736349454e-06,
+      "loss": 6.4574,
+      "step": 410
+    },
+    {
+      "epoch": 0.024572899602153055,
+      "grad_norm": 4.072322845458984,
+      "learning_rate": 4.06591263650546e-06,
+      "loss": 6.3977,
+      "step": 420
+    },
+    {
+      "epoch": 0.025157968640299556,
+      "grad_norm": 5.009401321411133,
+      "learning_rate": 4.163416536661466e-06,
+      "loss": 6.3332,
+      "step": 430
+    },
+    {
+      "epoch": 0.025743037678446056,
+      "grad_norm": 4.7960286140441895,
+      "learning_rate": 4.2609204368174725e-06,
+      "loss": 6.2778,
+      "step": 440
+    },
+    {
+      "epoch": 0.026328106716592557,
+      "grad_norm": 3.423039197921753,
+      "learning_rate": 4.358424336973479e-06,
+      "loss": 6.2403,
+      "step": 450
+    },
+    {
+      "epoch": 0.02691317575473906,
+      "grad_norm": 4.546107292175293,
+      "learning_rate": 4.455928237129486e-06,
+      "loss": 6.1845,
+      "step": 460
+    },
+    {
+      "epoch": 0.02749824479288556,
+      "grad_norm": 5.357204914093018,
+      "learning_rate": 4.553432137285492e-06,
+      "loss": 6.1027,
+      "step": 470
+    },
+    {
+      "epoch": 0.02808331383103206,
+      "grad_norm": 4.221200942993164,
+      "learning_rate": 4.650936037441498e-06,
+      "loss": 6.0522,
+      "step": 480
+    },
+    {
+      "epoch": 0.02866838286917856,
+      "grad_norm": 3.5187366008758545,
+      "learning_rate": 4.748439937597504e-06,
+      "loss": 6.0063,
+      "step": 490
+    },
+    {
+      "epoch": 0.029253451907325066,
+      "grad_norm": 4.514864444732666,
+      "learning_rate": 4.8459438377535105e-06,
+      "loss": 5.9795,
+      "step": 500
+    },
+    {
+      "epoch": 0.029838520945471566,
+      "grad_norm": 3.3947811126708984,
+      "learning_rate": 4.943447737909517e-06,
+      "loss": 5.9023,
+      "step": 510
+    },
+    {
+      "epoch": 0.030423589983618066,
+      "grad_norm": 5.161691665649414,
+      "learning_rate": 5.040951638065523e-06,
+      "loss": 5.8573,
+      "step": 520
+    },
+    {
+      "epoch": 0.031008659021764567,
+      "grad_norm": 5.400602340698242,
+      "learning_rate": 5.138455538221529e-06,
+      "loss": 5.8028,
+      "step": 530
+    },
+    {
+      "epoch": 0.03159372805991107,
+      "grad_norm": 4.410223007202148,
+      "learning_rate": 5.235959438377535e-06,
+      "loss": 5.7999,
+      "step": 540
+    },
+    {
+      "epoch": 0.03217879709805757,
+      "grad_norm": 3.708657741546631,
+      "learning_rate": 5.3334633385335414e-06,
+      "loss": 5.7253,
+      "step": 550
+    },
+    {
+      "epoch": 0.03276386613620407,
+      "grad_norm": 5.148003101348877,
+      "learning_rate": 5.430967238689548e-06,
+      "loss": 5.6967,
+      "step": 560
+    },
+    {
+      "epoch": 0.03334893517435057,
+      "grad_norm": 5.288032054901123,
+      "learning_rate": 5.528471138845554e-06,
+      "loss": 5.6411,
+      "step": 570
+    },
+    {
+      "epoch": 0.03393400421249707,
+      "grad_norm": 3.524038314819336,
+      "learning_rate": 5.62597503900156e-06,
+      "loss": 5.6076,
+      "step": 580
+    },
+    {
+      "epoch": 0.03451907325064357,
+      "grad_norm": 3.5057692527770996,
+      "learning_rate": 5.723478939157566e-06,
+      "loss": 5.5643,
+      "step": 590
+    },
+    {
+      "epoch": 0.03510414228879008,
+      "grad_norm": 4.593623161315918,
+      "learning_rate": 5.820982839313572e-06,
+      "loss": 5.5258,
+      "step": 600
+    },
+    {
+      "epoch": 0.03568921132693658,
+      "grad_norm": 3.617720127105713,
+      "learning_rate": 5.918486739469579e-06,
+      "loss": 5.4787,
+      "step": 610
+    },
+    {
+      "epoch": 0.03627428036508308,
+      "grad_norm": 3.6132466793060303,
+      "learning_rate": 6.015990639625586e-06,
+      "loss": 5.4579,
+      "step": 620
+    },
+    {
+      "epoch": 0.03685934940322958,
+      "grad_norm": 4.541604518890381,
+      "learning_rate": 6.113494539781592e-06,
+      "loss": 5.4188,
+      "step": 630
+    },
+    {
+      "epoch": 0.03744441844137608,
+      "grad_norm": 4.4366254806518555,
+      "learning_rate": 6.210998439937598e-06,
+      "loss": 5.3733,
+      "step": 640
+    },
+    {
+      "epoch": 0.03802948747952258,
+      "grad_norm": 4.232390403747559,
+      "learning_rate": 6.308502340093604e-06,
+      "loss": 5.3406,
+      "step": 650
+    },
+    {
+      "epoch": 0.03861455651766908,
+      "grad_norm": 3.4591336250305176,
+      "learning_rate": 6.4060062402496095e-06,
+      "loss": 5.2955,
+      "step": 660
+    },
+    {
+      "epoch": 0.03919962555581558,
+      "grad_norm": 5.510738849639893,
+      "learning_rate": 6.5035101404056166e-06,
+      "loss": 5.2574,
+      "step": 670
+    },
+    {
+      "epoch": 0.03978469459396209,
+      "grad_norm": 5.914499282836914,
+      "learning_rate": 6.601014040561624e-06,
+      "loss": 5.2101,
+      "step": 680
+    },
+    {
+      "epoch": 0.04036976363210859,
+      "grad_norm": 4.961345672607422,
+      "learning_rate": 6.698517940717629e-06,
+      "loss": 5.2078,
+      "step": 690
+    },
+    {
+      "epoch": 0.04095483267025509,
+      "grad_norm": 4.458784580230713,
+      "learning_rate": 6.796021840873636e-06,
+      "loss": 5.1342,
+      "step": 700
+    },
+    {
+      "epoch": 0.04153990170840159,
+      "grad_norm": 4.344489574432373,
+      "learning_rate": 6.893525741029641e-06,
+      "loss": 5.1115,
+      "step": 710
+    },
+    {
+      "epoch": 0.04212497074654809,
+      "grad_norm": 3.6247169971466064,
+      "learning_rate": 6.991029641185648e-06,
+      "loss": 5.1027,
+      "step": 720
+    },
+    {
+      "epoch": 0.04271003978469459,
+      "grad_norm": 4.234716415405273,
+      "learning_rate": 7.088533541341654e-06,
+      "loss": 5.0847,
+      "step": 730
+    },
+    {
+      "epoch": 0.04329510882284109,
+      "grad_norm": 3.6908812522888184,
+      "learning_rate": 7.18603744149766e-06,
+      "loss": 5.0225,
+      "step": 740
+    },
+    {
+      "epoch": 0.04388017786098759,
+      "grad_norm": 4.514861583709717,
+      "learning_rate": 7.283541341653667e-06,
+      "loss": 5.0108,
+      "step": 750
+    },
+    {
+      "epoch": 0.0444652468991341,
+      "grad_norm": 4.391042709350586,
+      "learning_rate": 7.381045241809672e-06,
+      "loss": 4.9743,
+      "step": 760
+    },
+    {
+      "epoch": 0.0450503159372806,
+      "grad_norm": 4.511202335357666,
+      "learning_rate": 7.478549141965679e-06,
+      "loss": 4.964,
+      "step": 770
+    },
+    {
+      "epoch": 0.0456353849754271,
+      "grad_norm": 4.672698497772217,
+      "learning_rate": 7.576053042121685e-06,
+      "loss": 4.9322,
+      "step": 780
+    },
+    {
+      "epoch": 0.0462204540135736,
+      "grad_norm": 5.7832207679748535,
+      "learning_rate": 7.673556942277693e-06,
+      "loss": 4.9004,
+      "step": 790
+    },
+    {
+      "epoch": 0.0468055230517201,
+      "grad_norm": 5.457101345062256,
+      "learning_rate": 7.771060842433697e-06,
+      "loss": 4.8727,
+      "step": 800
+    },
+    {
+      "epoch": 0.0473905920898666,
+      "grad_norm": 5.284400463104248,
+      "learning_rate": 7.868564742589705e-06,
+      "loss": 4.8428,
+      "step": 810
+    },
+    {
+      "epoch": 0.0479756611280131,
+      "grad_norm": 5.2293829917907715,
+      "learning_rate": 7.96606864274571e-06,
+      "loss": 4.8219,
+      "step": 820
+    },
+    {
+      "epoch": 0.04856073016615961,
+      "grad_norm": 4.629702091217041,
+      "learning_rate": 8.063572542901716e-06,
+      "loss": 4.7985,
+      "step": 830
+    },
+    {
+      "epoch": 0.04914579920430611,
+      "grad_norm": 4.912006378173828,
+      "learning_rate": 8.161076443057723e-06,
+      "loss": 4.8071,
+      "step": 840
+    },
+    {
+      "epoch": 0.04973086824245261,
+      "grad_norm": 4.5750346183776855,
+      "learning_rate": 8.258580343213728e-06,
+      "loss": 4.7561,
+      "step": 850
+    },
+    {
+      "epoch": 0.05031593728059911,
+      "grad_norm": 4.587753772735596,
+      "learning_rate": 8.356084243369736e-06,
+      "loss": 4.7323,
+      "step": 860
+    },
+    {
+      "epoch": 0.05090100631874561,
+      "grad_norm": 4.676549434661865,
+      "learning_rate": 8.45358814352574e-06,
+      "loss": 4.6787,
+      "step": 870
+    },
+    {
+      "epoch": 0.05148607535689211,
+      "grad_norm": 4.039685249328613,
+      "learning_rate": 8.551092043681748e-06,
+      "loss": 4.6808,
+      "step": 880
+    },
+    {
+      "epoch": 0.05207114439503861,
+      "grad_norm": 4.448736667633057,
+      "learning_rate": 8.648595943837753e-06,
+      "loss": 4.6849,
+      "step": 890
+    },
+    {
+      "epoch": 0.05265621343318511,
+      "grad_norm": 4.613405227661133,
+      "learning_rate": 8.74609984399376e-06,
+      "loss": 4.6309,
+      "step": 900
+    },
+    {
+      "epoch": 0.05324128247133162,
+      "grad_norm": 4.352312088012695,
+      "learning_rate": 8.843603744149765e-06,
+      "loss": 4.6045,
+      "step": 910
+    },
+    {
+      "epoch": 0.05382635150947812,
+      "grad_norm": 7.403280258178711,
+      "learning_rate": 8.941107644305773e-06,
+      "loss": 4.6065,
+      "step": 920
+    },
+    {
+      "epoch": 0.05441142054762462,
+      "grad_norm": 6.342307090759277,
+      "learning_rate": 9.03861154446178e-06,
+      "loss": 4.5865,
+      "step": 930
+    },
+    {
+      "epoch": 0.05499648958577112,
+      "grad_norm": 4.280608177185059,
+      "learning_rate": 9.136115444617785e-06,
+      "loss": 4.5471,
+      "step": 940
+    },
+    {
+      "epoch": 0.05558155862391762,
+      "grad_norm": 5.916503429412842,
+      "learning_rate": 9.233619344773792e-06,
+      "loss": 4.5704,
+      "step": 950
+    },
+    {
+      "epoch": 0.05616662766206412,
+      "grad_norm": 5.4643940925598145,
+      "learning_rate": 9.331123244929798e-06,
+      "loss": 4.5332,
+      "step": 960
+    },
+    {
+      "epoch": 0.05675169670021062,
+      "grad_norm": 6.90775728225708,
+      "learning_rate": 9.428627145085804e-06,
+      "loss": 4.5466,
+      "step": 970
+    },
+    {
+      "epoch": 0.05733676573835712,
+      "grad_norm": 4.458855152130127,
+      "learning_rate": 9.52613104524181e-06,
+      "loss": 4.4947,
+      "step": 980
+    },
+    {
+      "epoch": 0.05792183477650363,
+      "grad_norm": 4.308935642242432,
+      "learning_rate": 9.623634945397816e-06,
+      "loss": 4.4815,
+      "step": 990
+    },
+    {
+      "epoch": 0.05850690381465013,
+      "grad_norm": 4.152543544769287,
+      "learning_rate": 9.721138845553823e-06,
+      "loss": 4.4704,
+      "step": 1000
+    },
+    {
+      "epoch": 0.05850690381465013,
+      "eval_loss": 4.4797258377075195,
+      "eval_runtime": 32.8353,
+      "eval_samples_per_second": 687.889,
+      "eval_steps_per_second": 5.391,
+      "step": 1000
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 51276,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 1000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 9201055137398784.0,
+  "train_batch_size": 128,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-1000/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6ab409182da5adbcb18002f22d82b6427b8d967f738680c7220c631231212807
+size 5841

checkpoint-1000/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff