invi-bhagyesh commited on Oct 11, 2025

Commit

567fb4d

verified ·

1 Parent(s): 3f70c6d

Upload folder using huggingface_hub

Browse files

Files changed (48) hide show

distilgpt2/checkpoint-900/config.json +45 -0
distilgpt2/checkpoint-900/generation_config.json +6 -0
distilgpt2/checkpoint-900/merges.txt +0 -0
distilgpt2/checkpoint-900/model.safetensors +3 -0
distilgpt2/checkpoint-900/optimizer.pt +3 -0
distilgpt2/checkpoint-900/rng_state.pth +3 -0
distilgpt2/checkpoint-900/scheduler.pt +3 -0
distilgpt2/checkpoint-900/special_tokens_map.json +6 -0
distilgpt2/checkpoint-900/tokenizer.json +0 -0
distilgpt2/checkpoint-900/tokenizer_config.json +21 -0
distilgpt2/checkpoint-900/trainer_state.json +664 -0
distilgpt2/checkpoint-900/training_args.bin +3 -0
distilgpt2/checkpoint-900/vocab.json +0 -0
distilgpt2/checkpoint-939/config.json +45 -0
distilgpt2/checkpoint-939/generation_config.json +6 -0
distilgpt2/checkpoint-939/merges.txt +0 -0
distilgpt2/checkpoint-939/model.safetensors +3 -0
distilgpt2/checkpoint-939/optimizer.pt +3 -0
distilgpt2/checkpoint-939/rng_state.pth +3 -0
distilgpt2/checkpoint-939/scheduler.pt +3 -0
distilgpt2/checkpoint-939/special_tokens_map.json +6 -0
distilgpt2/checkpoint-939/tokenizer.json +0 -0
distilgpt2/checkpoint-939/tokenizer_config.json +21 -0
distilgpt2/checkpoint-939/trainer_state.json +685 -0
distilgpt2/checkpoint-939/training_args.bin +3 -0
distilgpt2/checkpoint-939/vocab.json +0 -0
distilgpt2/fp32_finetuned_model/config.json +45 -0
distilgpt2/fp32_finetuned_model/generation_config.json +6 -0
distilgpt2/fp32_finetuned_model/merges.txt +0 -0
distilgpt2/fp32_finetuned_model/model.safetensors +3 -0
distilgpt2/fp32_finetuned_model/special_tokens_map.json +6 -0
distilgpt2/fp32_finetuned_model/tokenizer.json +0 -0
distilgpt2/fp32_finetuned_model/tokenizer_config.json +21 -0
distilgpt2/fp32_finetuned_model/training_args.bin +3 -0
distilgpt2/fp32_finetuned_model/vocab.json +0 -0
distilgpt2/fp32_model/model.pt +3 -0
distilgpt2/int4_model/model.pt +3 -0
distilgpt2/qat_model/config.json +45 -0
distilgpt2/qat_model/generation_config.json +6 -0
distilgpt2/qat_model/merges.txt +0 -0
distilgpt2/qat_model/model.safetensors +3 -0
distilgpt2/qat_model/special_tokens_map.json +6 -0
distilgpt2/qat_model/tokenizer.json +0 -0
distilgpt2/qat_model/tokenizer_config.json +21 -0
distilgpt2/qat_model/training_args.bin +3 -0
distilgpt2/qat_model/vocab.json +0 -0
distilgpt2/runs/Oct11_16-56-47_5a2111a52440/events.out.tfevents.1760201808.5a2111a52440.96.0 +3 -0
distilgpt2/runs/Oct11_16-56-47_5a2111a52440/events.out.tfevents.1760203459.5a2111a52440.96.1 +3 -0

distilgpt2/checkpoint-900/config.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+  "_num_labels": 1,
+  "activation_function": "gelu_new",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50256,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50256,
+  "id2label": {
+    "0": "LABEL_0"
+  },
+  "initializer_range": 0.02,
+  "label2id": {
+    "LABEL_0": 0
+  },
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 1024,
+  "n_embd": 768,
+  "n_head": 12,
+  "n_inner": null,
+  "n_layer": 6,
+  "n_positions": 1024,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "task_specific_params": {
+    "text-generation": {
+      "do_sample": true,
+      "max_length": 50
+    }
+  },
+  "torch_dtype": "float32",
+  "transformers_version": "4.53.3",
+  "use_cache": true,
+  "vocab_size": 50257
+}

distilgpt2/checkpoint-900/generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "transformers_version": "4.53.3"
+}

distilgpt2/checkpoint-900/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

distilgpt2/checkpoint-900/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4156d69a8bbb0eed81f66098024baba021c49a1bbf6c7cc291713d882a0a2663
+size 327657928

distilgpt2/checkpoint-900/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4b1e018a98acc59b197197cb2144e3047862b56f0d23827ad8ca508221bc95be
+size 655364474

distilgpt2/checkpoint-900/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:018c226caeb64f7d9e8d90042a28fb98b683c4520b26ddb99a2d2cf5835d83c6
+size 14244

distilgpt2/checkpoint-900/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:678712356fafda5888bf10ab4c93223826a90f603ff85cc4c0c5fa5cef5f7aff
+size 1064

distilgpt2/checkpoint-900/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "bos_token": "<|endoftext|>",
+  "eos_token": "<|endoftext|>",
+  "pad_token": "<|endoftext|>",
+  "unk_token": "<|endoftext|>"
+}

distilgpt2/checkpoint-900/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

distilgpt2/checkpoint-900/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,21 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "50256": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "model_max_length": 1024,
+  "pad_token": "<|endoftext|>",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>"
+}

distilgpt2/checkpoint-900/trainer_state.json ADDED Viewed

	@@ -0,0 +1,664 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.8768000000000002,
+  "eval_steps": 50,
+  "global_step": 900,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.032,
+      "grad_norm": 5.848097801208496,
+      "learning_rate": 4.5e-06,
+      "loss": 2.3184,
+      "step": 10
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 2.5511839389801025,
+      "learning_rate": 9.5e-06,
+      "loss": 2.0441,
+      "step": 20
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 2.325101613998413,
+      "learning_rate": 1.45e-05,
+      "loss": 1.917,
+      "step": 30
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 2.8172621726989746,
+      "learning_rate": 1.9500000000000003e-05,
+      "loss": 2.2506,
+      "step": 40
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 3.325876474380493,
+      "learning_rate": 2.45e-05,
+      "loss": 1.8721,
+      "step": 50
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 2.435955286026001,
+      "learning_rate": 2.95e-05,
+      "loss": 1.8327,
+      "step": 60
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 2.8088173866271973,
+      "learning_rate": 3.45e-05,
+      "loss": 1.7227,
+      "step": 70
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 2.5641565322875977,
+      "learning_rate": 3.9500000000000005e-05,
+      "loss": 1.8172,
+      "step": 80
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 2.741063356399536,
+      "learning_rate": 4.4500000000000004e-05,
+      "loss": 1.6577,
+      "step": 90
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 2.7209696769714355,
+      "learning_rate": 4.9500000000000004e-05,
+      "loss": 1.8833,
+      "step": 100
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 2.6947555541992188,
+      "learning_rate": 4.946364719904649e-05,
+      "loss": 2.0177,
+      "step": 110
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 2.37589955329895,
+      "learning_rate": 4.886769964243147e-05,
+      "loss": 1.8127,
+      "step": 120
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 3.018338203430176,
+      "learning_rate": 4.8271752085816455e-05,
+      "loss": 1.862,
+      "step": 130
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 3.863445997238159,
+      "learning_rate": 4.767580452920143e-05,
+      "loss": 1.9697,
+      "step": 140
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 2.984334945678711,
+      "learning_rate": 4.7079856972586414e-05,
+      "loss": 1.9533,
+      "step": 150
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 2.4165594577789307,
+      "learning_rate": 4.64839094159714e-05,
+      "loss": 1.8285,
+      "step": 160
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 1.7296801805496216,
+      "learning_rate": 4.5887961859356374e-05,
+      "loss": 1.783,
+      "step": 170
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 2.5170071125030518,
+      "learning_rate": 4.529201430274136e-05,
+      "loss": 1.5282,
+      "step": 180
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 2.53623366355896,
+      "learning_rate": 4.469606674612634e-05,
+      "loss": 1.8379,
+      "step": 190
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 2.4525668621063232,
+      "learning_rate": 4.410011918951132e-05,
+      "loss": 1.9569,
+      "step": 200
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 3.1210453510284424,
+      "learning_rate": 4.3504171632896306e-05,
+      "loss": 1.8645,
+      "step": 210
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 3.257766008377075,
+      "learning_rate": 4.290822407628129e-05,
+      "loss": 1.7785,
+      "step": 220
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 2.6239209175109863,
+      "learning_rate": 4.231227651966627e-05,
+      "loss": 1.7526,
+      "step": 230
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 2.526646137237549,
+      "learning_rate": 4.1716328963051256e-05,
+      "loss": 1.8174,
+      "step": 240
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 2.1927671432495117,
+      "learning_rate": 4.112038140643624e-05,
+      "loss": 1.5208,
+      "step": 250
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 3.186937093734741,
+      "learning_rate": 4.052443384982122e-05,
+      "loss": 1.6424,
+      "step": 260
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 2.3542070388793945,
+      "learning_rate": 3.9928486293206205e-05,
+      "loss": 1.7575,
+      "step": 270
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 1.9437086582183838,
+      "learning_rate": 3.933253873659118e-05,
+      "loss": 1.5789,
+      "step": 280
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 2.5914506912231445,
+      "learning_rate": 3.8736591179976165e-05,
+      "loss": 1.6693,
+      "step": 290
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.9983558654785156,
+      "learning_rate": 3.814064362336114e-05,
+      "loss": 1.7447,
+      "step": 300
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 1.7978435754776,
+      "learning_rate": 3.7544696066746124e-05,
+      "loss": 1.6711,
+      "step": 310
+    },
+    {
+      "epoch": 1.0224,
+      "grad_norm": 3.406076192855835,
+      "learning_rate": 3.694874851013111e-05,
+      "loss": 1.8133,
+      "step": 320
+    },
+    {
+      "epoch": 1.0544,
+      "grad_norm": 2.2293291091918945,
+      "learning_rate": 3.635280095351609e-05,
+      "loss": 1.7722,
+      "step": 330
+    },
+    {
+      "epoch": 1.0864,
+      "grad_norm": 2.3838605880737305,
+      "learning_rate": 3.575685339690107e-05,
+      "loss": 1.6334,
+      "step": 340
+    },
+    {
+      "epoch": 1.1184,
+      "grad_norm": 2.256012439727783,
+      "learning_rate": 3.5160905840286057e-05,
+      "loss": 1.5336,
+      "step": 350
+    },
+    {
+      "epoch": 1.1504,
+      "grad_norm": 2.4280765056610107,
+      "learning_rate": 3.456495828367104e-05,
+      "loss": 1.6229,
+      "step": 360
+    },
+    {
+      "epoch": 1.1824,
+      "grad_norm": 2.8488781452178955,
+      "learning_rate": 3.396901072705602e-05,
+      "loss": 1.7856,
+      "step": 370
+    },
+    {
+      "epoch": 1.2144,
+      "grad_norm": 1.915912389755249,
+      "learning_rate": 3.3373063170441006e-05,
+      "loss": 1.6749,
+      "step": 380
+    },
+    {
+      "epoch": 1.2464,
+      "grad_norm": 2.3185014724731445,
+      "learning_rate": 3.277711561382599e-05,
+      "loss": 1.4804,
+      "step": 390
+    },
+    {
+      "epoch": 1.2784,
+      "grad_norm": 3.116079330444336,
+      "learning_rate": 3.218116805721097e-05,
+      "loss": 1.7211,
+      "step": 400
+    },
+    {
+      "epoch": 1.3104,
+      "grad_norm": 2.4169015884399414,
+      "learning_rate": 3.158522050059595e-05,
+      "loss": 1.9021,
+      "step": 410
+    },
+    {
+      "epoch": 1.3424,
+      "grad_norm": 2.3938851356506348,
+      "learning_rate": 3.098927294398093e-05,
+      "loss": 1.8511,
+      "step": 420
+    },
+    {
+      "epoch": 1.3744,
+      "grad_norm": 2.8537979125976562,
+      "learning_rate": 3.039332538736591e-05,
+      "loss": 1.7034,
+      "step": 430
+    },
+    {
+      "epoch": 1.4064,
+      "grad_norm": 2.3688974380493164,
+      "learning_rate": 2.9797377830750894e-05,
+      "loss": 1.5986,
+      "step": 440
+    },
+    {
+      "epoch": 1.4384000000000001,
+      "grad_norm": 2.4708006381988525,
+      "learning_rate": 2.9201430274135878e-05,
+      "loss": 1.8078,
+      "step": 450
+    },
+    {
+      "epoch": 1.4704,
+      "grad_norm": 2.428288698196411,
+      "learning_rate": 2.860548271752086e-05,
+      "loss": 1.7737,
+      "step": 460
+    },
+    {
+      "epoch": 1.5024,
+      "grad_norm": 3.910275459289551,
+      "learning_rate": 2.800953516090584e-05,
+      "loss": 1.7495,
+      "step": 470
+    },
+    {
+      "epoch": 1.5344,
+      "grad_norm": 3.0172460079193115,
+      "learning_rate": 2.7413587604290824e-05,
+      "loss": 1.514,
+      "step": 480
+    },
+    {
+      "epoch": 1.5664,
+      "grad_norm": 2.250343084335327,
+      "learning_rate": 2.6817640047675807e-05,
+      "loss": 1.8304,
+      "step": 490
+    },
+    {
+      "epoch": 1.5984,
+      "grad_norm": 2.327152729034424,
+      "learning_rate": 2.622169249106079e-05,
+      "loss": 1.739,
+      "step": 500
+    },
+    {
+      "epoch": 1.6303999999999998,
+      "grad_norm": 2.144254207611084,
+      "learning_rate": 2.5625744934445773e-05,
+      "loss": 1.6757,
+      "step": 510
+    },
+    {
+      "epoch": 1.6623999999999999,
+      "grad_norm": 3.2096481323242188,
+      "learning_rate": 2.5029797377830756e-05,
+      "loss": 1.4808,
+      "step": 520
+    },
+    {
+      "epoch": 1.6944,
+      "grad_norm": 2.584341287612915,
+      "learning_rate": 2.4433849821215736e-05,
+      "loss": 1.4193,
+      "step": 530
+    },
+    {
+      "epoch": 1.7264,
+      "grad_norm": 1.8545327186584473,
+      "learning_rate": 2.3837902264600716e-05,
+      "loss": 1.5282,
+      "step": 540
+    },
+    {
+      "epoch": 1.7584,
+      "grad_norm": 2.4782474040985107,
+      "learning_rate": 2.32419547079857e-05,
+      "loss": 1.7681,
+      "step": 550
+    },
+    {
+      "epoch": 1.7904,
+      "grad_norm": 3.0083882808685303,
+      "learning_rate": 2.264600715137068e-05,
+      "loss": 1.8547,
+      "step": 560
+    },
+    {
+      "epoch": 1.8224,
+      "grad_norm": 2.600146770477295,
+      "learning_rate": 2.205005959475566e-05,
+      "loss": 1.55,
+      "step": 570
+    },
+    {
+      "epoch": 1.8544,
+      "grad_norm": 1.716131567955017,
+      "learning_rate": 2.1454112038140645e-05,
+      "loss": 1.7406,
+      "step": 580
+    },
+    {
+      "epoch": 1.8864,
+      "grad_norm": 2.536386728286743,
+      "learning_rate": 2.0858164481525628e-05,
+      "loss": 1.6436,
+      "step": 590
+    },
+    {
+      "epoch": 1.9184,
+      "grad_norm": 2.161496877670288,
+      "learning_rate": 2.026221692491061e-05,
+      "loss": 1.6616,
+      "step": 600
+    },
+    {
+      "epoch": 1.9504000000000001,
+      "grad_norm": 2.6042492389678955,
+      "learning_rate": 1.966626936829559e-05,
+      "loss": 1.6278,
+      "step": 610
+    },
+    {
+      "epoch": 1.9824000000000002,
+      "grad_norm": 2.641090154647827,
+      "learning_rate": 1.907032181168057e-05,
+      "loss": 1.5632,
+      "step": 620
+    },
+    {
+      "epoch": 2.0128,
+      "grad_norm": 2.133544683456421,
+      "learning_rate": 1.8474374255065554e-05,
+      "loss": 1.6812,
+      "step": 630
+    },
+    {
+      "epoch": 2.0448,
+      "grad_norm": 3.285512685775757,
+      "learning_rate": 1.7878426698450537e-05,
+      "loss": 1.6949,
+      "step": 640
+    },
+    {
+      "epoch": 2.0768,
+      "grad_norm": 2.3074097633361816,
+      "learning_rate": 1.728247914183552e-05,
+      "loss": 1.6071,
+      "step": 650
+    },
+    {
+      "epoch": 2.1088,
+      "grad_norm": 3.0742709636688232,
+      "learning_rate": 1.6686531585220503e-05,
+      "loss": 1.7419,
+      "step": 660
+    },
+    {
+      "epoch": 2.1408,
+      "grad_norm": 1.7113871574401855,
+      "learning_rate": 1.6090584028605486e-05,
+      "loss": 1.4946,
+      "step": 670
+    },
+    {
+      "epoch": 2.1728,
+      "grad_norm": 2.1010677814483643,
+      "learning_rate": 1.5494636471990466e-05,
+      "loss": 1.4342,
+      "step": 680
+    },
+    {
+      "epoch": 2.2048,
+      "grad_norm": 2.1286814212799072,
+      "learning_rate": 1.4898688915375447e-05,
+      "loss": 1.5922,
+      "step": 690
+    },
+    {
+      "epoch": 2.2368,
+      "grad_norm": 2.790449619293213,
+      "learning_rate": 1.430274135876043e-05,
+      "loss": 1.8599,
+      "step": 700
+    },
+    {
+      "epoch": 2.2688,
+      "grad_norm": 2.589710235595703,
+      "learning_rate": 1.3706793802145412e-05,
+      "loss": 1.5151,
+      "step": 710
+    },
+    {
+      "epoch": 2.3008,
+      "grad_norm": 2.7718470096588135,
+      "learning_rate": 1.3110846245530395e-05,
+      "loss": 1.6952,
+      "step": 720
+    },
+    {
+      "epoch": 2.3327999999999998,
+      "grad_norm": 2.431128978729248,
+      "learning_rate": 1.2514898688915378e-05,
+      "loss": 1.5283,
+      "step": 730
+    },
+    {
+      "epoch": 2.3648,
+      "grad_norm": 2.2655253410339355,
+      "learning_rate": 1.1918951132300358e-05,
+      "loss": 1.6497,
+      "step": 740
+    },
+    {
+      "epoch": 2.3968,
+      "grad_norm": 2.075089454650879,
+      "learning_rate": 1.132300357568534e-05,
+      "loss": 1.6415,
+      "step": 750
+    },
+    {
+      "epoch": 2.4288,
+      "grad_norm": 2.5529932975769043,
+      "learning_rate": 1.0727056019070322e-05,
+      "loss": 1.5808,
+      "step": 760
+    },
+    {
+      "epoch": 2.4608,
+      "grad_norm": 1.4980545043945312,
+      "learning_rate": 1.0131108462455305e-05,
+      "loss": 1.5812,
+      "step": 770
+    },
+    {
+      "epoch": 2.4928,
+      "grad_norm": 1.8248558044433594,
+      "learning_rate": 9.535160905840285e-06,
+      "loss": 1.4732,
+      "step": 780
+    },
+    {
+      "epoch": 2.5248,
+      "grad_norm": 3.1333248615264893,
+      "learning_rate": 8.939213349225268e-06,
+      "loss": 1.6248,
+      "step": 790
+    },
+    {
+      "epoch": 2.5568,
+      "grad_norm": 2.8021600246429443,
+      "learning_rate": 8.343265792610251e-06,
+      "loss": 1.6294,
+      "step": 800
+    },
+    {
+      "epoch": 2.5888,
+      "grad_norm": 1.942338466644287,
+      "learning_rate": 7.747318235995233e-06,
+      "loss": 1.5882,
+      "step": 810
+    },
+    {
+      "epoch": 2.6208,
+      "grad_norm": 2.3626883029937744,
+      "learning_rate": 7.151370679380215e-06,
+      "loss": 1.8162,
+      "step": 820
+    },
+    {
+      "epoch": 2.6528,
+      "grad_norm": 2.5083396434783936,
+      "learning_rate": 6.5554231227651975e-06,
+      "loss": 1.7253,
+      "step": 830
+    },
+    {
+      "epoch": 2.6848,
+      "grad_norm": 2.0188353061676025,
+      "learning_rate": 5.959475566150179e-06,
+      "loss": 1.7959,
+      "step": 840
+    },
+    {
+      "epoch": 2.7168,
+      "grad_norm": 2.74120831489563,
+      "learning_rate": 5.363528009535161e-06,
+      "loss": 1.6135,
+      "step": 850
+    },
+    {
+      "epoch": 2.7488,
+      "grad_norm": 2.245814561843872,
+      "learning_rate": 4.767580452920143e-06,
+      "loss": 1.6453,
+      "step": 860
+    },
+    {
+      "epoch": 2.7808,
+      "grad_norm": 2.572443962097168,
+      "learning_rate": 4.171632896305126e-06,
+      "loss": 1.6977,
+      "step": 870
+    },
+    {
+      "epoch": 2.8128,
+      "grad_norm": 2.1156558990478516,
+      "learning_rate": 3.5756853396901076e-06,
+      "loss": 1.5791,
+      "step": 880
+    },
+    {
+      "epoch": 2.8448,
+      "grad_norm": 2.4948208332061768,
+      "learning_rate": 2.9797377830750894e-06,
+      "loss": 1.6606,
+      "step": 890
+    },
+    {
+      "epoch": 2.8768000000000002,
+      "grad_norm": 2.791877269744873,
+      "learning_rate": 2.3837902264600713e-06,
+      "loss": 1.5517,
+      "step": 900
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 939,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 50,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3758492460515328.0,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

distilgpt2/checkpoint-900/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:03869acb65d97d306a6961c4204630b6685180f03ac4c51eaa6ded2aad61a5e8
+size 5368

distilgpt2/checkpoint-900/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

distilgpt2/checkpoint-939/config.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+  "_num_labels": 1,
+  "activation_function": "gelu_new",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50256,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50256,
+  "id2label": {
+    "0": "LABEL_0"
+  },
+  "initializer_range": 0.02,
+  "label2id": {
+    "LABEL_0": 0
+  },
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 1024,
+  "n_embd": 768,
+  "n_head": 12,
+  "n_inner": null,
+  "n_layer": 6,
+  "n_positions": 1024,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "task_specific_params": {
+    "text-generation": {
+      "do_sample": true,
+      "max_length": 50
+    }
+  },
+  "torch_dtype": "float32",
+  "transformers_version": "4.53.3",
+  "use_cache": true,
+  "vocab_size": 50257
+}

distilgpt2/checkpoint-939/generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "transformers_version": "4.53.3"
+}

distilgpt2/checkpoint-939/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

distilgpt2/checkpoint-939/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7b9577ecb05b6c1e4859b0f7cce7ae902d9f5295f60ad68a01c27e67ec0345ca
+size 327657928

distilgpt2/checkpoint-939/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:42110f9712bcdb6178b1e72ea0dd0ffd2239fea06bf950f5d6ebacb627847379
+size 655364474

distilgpt2/checkpoint-939/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a8e924ffd74e9e97176ade4c2113cc94c63ba2bdd44fd3b33a9d611bcd794c7d
+size 14244

distilgpt2/checkpoint-939/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:13d1f941d497036c9605187ce99bc7102dc62a0065d5cfc7ec062dec8be90fb2
+size 1064

distilgpt2/checkpoint-939/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "bos_token": "<|endoftext|>",
+  "eos_token": "<|endoftext|>",
+  "pad_token": "<|endoftext|>",
+  "unk_token": "<|endoftext|>"
+}

distilgpt2/checkpoint-939/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

distilgpt2/checkpoint-939/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,21 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "50256": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "model_max_length": 1024,
+  "pad_token": "<|endoftext|>",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>"
+}

distilgpt2/checkpoint-939/trainer_state.json ADDED Viewed

	@@ -0,0 +1,685 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.0,
+  "eval_steps": 50,
+  "global_step": 939,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.032,
+      "grad_norm": 5.848097801208496,
+      "learning_rate": 4.5e-06,
+      "loss": 2.3184,
+      "step": 10
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 2.5511839389801025,
+      "learning_rate": 9.5e-06,
+      "loss": 2.0441,
+      "step": 20
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 2.325101613998413,
+      "learning_rate": 1.45e-05,
+      "loss": 1.917,
+      "step": 30
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 2.8172621726989746,
+      "learning_rate": 1.9500000000000003e-05,
+      "loss": 2.2506,
+      "step": 40
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 3.325876474380493,
+      "learning_rate": 2.45e-05,
+      "loss": 1.8721,
+      "step": 50
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 2.435955286026001,
+      "learning_rate": 2.95e-05,
+      "loss": 1.8327,
+      "step": 60
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 2.8088173866271973,
+      "learning_rate": 3.45e-05,
+      "loss": 1.7227,
+      "step": 70
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 2.5641565322875977,
+      "learning_rate": 3.9500000000000005e-05,
+      "loss": 1.8172,
+      "step": 80
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 2.741063356399536,
+      "learning_rate": 4.4500000000000004e-05,
+      "loss": 1.6577,
+      "step": 90
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 2.7209696769714355,
+      "learning_rate": 4.9500000000000004e-05,
+      "loss": 1.8833,
+      "step": 100
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 2.6947555541992188,
+      "learning_rate": 4.946364719904649e-05,
+      "loss": 2.0177,
+      "step": 110
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 2.37589955329895,
+      "learning_rate": 4.886769964243147e-05,
+      "loss": 1.8127,
+      "step": 120
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 3.018338203430176,
+      "learning_rate": 4.8271752085816455e-05,
+      "loss": 1.862,
+      "step": 130
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 3.863445997238159,
+      "learning_rate": 4.767580452920143e-05,
+      "loss": 1.9697,
+      "step": 140
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 2.984334945678711,
+      "learning_rate": 4.7079856972586414e-05,
+      "loss": 1.9533,
+      "step": 150
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 2.4165594577789307,
+      "learning_rate": 4.64839094159714e-05,
+      "loss": 1.8285,
+      "step": 160
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 1.7296801805496216,
+      "learning_rate": 4.5887961859356374e-05,
+      "loss": 1.783,
+      "step": 170
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 2.5170071125030518,
+      "learning_rate": 4.529201430274136e-05,
+      "loss": 1.5282,
+      "step": 180
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 2.53623366355896,
+      "learning_rate": 4.469606674612634e-05,
+      "loss": 1.8379,
+      "step": 190
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 2.4525668621063232,
+      "learning_rate": 4.410011918951132e-05,
+      "loss": 1.9569,
+      "step": 200
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 3.1210453510284424,
+      "learning_rate": 4.3504171632896306e-05,
+      "loss": 1.8645,
+      "step": 210
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 3.257766008377075,
+      "learning_rate": 4.290822407628129e-05,
+      "loss": 1.7785,
+      "step": 220
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 2.6239209175109863,
+      "learning_rate": 4.231227651966627e-05,
+      "loss": 1.7526,
+      "step": 230
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 2.526646137237549,
+      "learning_rate": 4.1716328963051256e-05,
+      "loss": 1.8174,
+      "step": 240
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 2.1927671432495117,
+      "learning_rate": 4.112038140643624e-05,
+      "loss": 1.5208,
+      "step": 250
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 3.186937093734741,
+      "learning_rate": 4.052443384982122e-05,
+      "loss": 1.6424,
+      "step": 260
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 2.3542070388793945,
+      "learning_rate": 3.9928486293206205e-05,
+      "loss": 1.7575,
+      "step": 270
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 1.9437086582183838,
+      "learning_rate": 3.933253873659118e-05,
+      "loss": 1.5789,
+      "step": 280
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 2.5914506912231445,
+      "learning_rate": 3.8736591179976165e-05,
+      "loss": 1.6693,
+      "step": 290
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.9983558654785156,
+      "learning_rate": 3.814064362336114e-05,
+      "loss": 1.7447,
+      "step": 300
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 1.7978435754776,
+      "learning_rate": 3.7544696066746124e-05,
+      "loss": 1.6711,
+      "step": 310
+    },
+    {
+      "epoch": 1.0224,
+      "grad_norm": 3.406076192855835,
+      "learning_rate": 3.694874851013111e-05,
+      "loss": 1.8133,
+      "step": 320
+    },
+    {
+      "epoch": 1.0544,
+      "grad_norm": 2.2293291091918945,
+      "learning_rate": 3.635280095351609e-05,
+      "loss": 1.7722,
+      "step": 330
+    },
+    {
+      "epoch": 1.0864,
+      "grad_norm": 2.3838605880737305,
+      "learning_rate": 3.575685339690107e-05,
+      "loss": 1.6334,
+      "step": 340
+    },
+    {
+      "epoch": 1.1184,
+      "grad_norm": 2.256012439727783,
+      "learning_rate": 3.5160905840286057e-05,
+      "loss": 1.5336,
+      "step": 350
+    },
+    {
+      "epoch": 1.1504,
+      "grad_norm": 2.4280765056610107,
+      "learning_rate": 3.456495828367104e-05,
+      "loss": 1.6229,
+      "step": 360
+    },
+    {
+      "epoch": 1.1824,
+      "grad_norm": 2.8488781452178955,
+      "learning_rate": 3.396901072705602e-05,
+      "loss": 1.7856,
+      "step": 370
+    },
+    {
+      "epoch": 1.2144,
+      "grad_norm": 1.915912389755249,
+      "learning_rate": 3.3373063170441006e-05,
+      "loss": 1.6749,
+      "step": 380
+    },
+    {
+      "epoch": 1.2464,
+      "grad_norm": 2.3185014724731445,
+      "learning_rate": 3.277711561382599e-05,
+      "loss": 1.4804,
+      "step": 390
+    },
+    {
+      "epoch": 1.2784,
+      "grad_norm": 3.116079330444336,
+      "learning_rate": 3.218116805721097e-05,
+      "loss": 1.7211,
+      "step": 400
+    },
+    {
+      "epoch": 1.3104,
+      "grad_norm": 2.4169015884399414,
+      "learning_rate": 3.158522050059595e-05,
+      "loss": 1.9021,
+      "step": 410
+    },
+    {
+      "epoch": 1.3424,
+      "grad_norm": 2.3938851356506348,
+      "learning_rate": 3.098927294398093e-05,
+      "loss": 1.8511,
+      "step": 420
+    },
+    {
+      "epoch": 1.3744,
+      "grad_norm": 2.8537979125976562,
+      "learning_rate": 3.039332538736591e-05,
+      "loss": 1.7034,
+      "step": 430
+    },
+    {
+      "epoch": 1.4064,
+      "grad_norm": 2.3688974380493164,
+      "learning_rate": 2.9797377830750894e-05,
+      "loss": 1.5986,
+      "step": 440
+    },
+    {
+      "epoch": 1.4384000000000001,
+      "grad_norm": 2.4708006381988525,
+      "learning_rate": 2.9201430274135878e-05,
+      "loss": 1.8078,
+      "step": 450
+    },
+    {
+      "epoch": 1.4704,
+      "grad_norm": 2.428288698196411,
+      "learning_rate": 2.860548271752086e-05,
+      "loss": 1.7737,
+      "step": 460
+    },
+    {
+      "epoch": 1.5024,
+      "grad_norm": 3.910275459289551,
+      "learning_rate": 2.800953516090584e-05,
+      "loss": 1.7495,
+      "step": 470
+    },
+    {
+      "epoch": 1.5344,
+      "grad_norm": 3.0172460079193115,
+      "learning_rate": 2.7413587604290824e-05,
+      "loss": 1.514,
+      "step": 480
+    },
+    {
+      "epoch": 1.5664,
+      "grad_norm": 2.250343084335327,
+      "learning_rate": 2.6817640047675807e-05,
+      "loss": 1.8304,
+      "step": 490
+    },
+    {
+      "epoch": 1.5984,
+      "grad_norm": 2.327152729034424,
+      "learning_rate": 2.622169249106079e-05,
+      "loss": 1.739,
+      "step": 500
+    },
+    {
+      "epoch": 1.6303999999999998,
+      "grad_norm": 2.144254207611084,
+      "learning_rate": 2.5625744934445773e-05,
+      "loss": 1.6757,
+      "step": 510
+    },
+    {
+      "epoch": 1.6623999999999999,
+      "grad_norm": 3.2096481323242188,
+      "learning_rate": 2.5029797377830756e-05,
+      "loss": 1.4808,
+      "step": 520
+    },
+    {
+      "epoch": 1.6944,
+      "grad_norm": 2.584341287612915,
+      "learning_rate": 2.4433849821215736e-05,
+      "loss": 1.4193,
+      "step": 530
+    },
+    {
+      "epoch": 1.7264,
+      "grad_norm": 1.8545327186584473,
+      "learning_rate": 2.3837902264600716e-05,
+      "loss": 1.5282,
+      "step": 540
+    },
+    {
+      "epoch": 1.7584,
+      "grad_norm": 2.4782474040985107,
+      "learning_rate": 2.32419547079857e-05,
+      "loss": 1.7681,
+      "step": 550
+    },
+    {
+      "epoch": 1.7904,
+      "grad_norm": 3.0083882808685303,
+      "learning_rate": 2.264600715137068e-05,
+      "loss": 1.8547,
+      "step": 560
+    },
+    {
+      "epoch": 1.8224,
+      "grad_norm": 2.600146770477295,
+      "learning_rate": 2.205005959475566e-05,
+      "loss": 1.55,
+      "step": 570
+    },
+    {
+      "epoch": 1.8544,
+      "grad_norm": 1.716131567955017,
+      "learning_rate": 2.1454112038140645e-05,
+      "loss": 1.7406,
+      "step": 580
+    },
+    {
+      "epoch": 1.8864,
+      "grad_norm": 2.536386728286743,
+      "learning_rate": 2.0858164481525628e-05,
+      "loss": 1.6436,
+      "step": 590
+    },
+    {
+      "epoch": 1.9184,
+      "grad_norm": 2.161496877670288,
+      "learning_rate": 2.026221692491061e-05,
+      "loss": 1.6616,
+      "step": 600
+    },
+    {
+      "epoch": 1.9504000000000001,
+      "grad_norm": 2.6042492389678955,
+      "learning_rate": 1.966626936829559e-05,
+      "loss": 1.6278,
+      "step": 610
+    },
+    {
+      "epoch": 1.9824000000000002,
+      "grad_norm": 2.641090154647827,
+      "learning_rate": 1.907032181168057e-05,
+      "loss": 1.5632,
+      "step": 620
+    },
+    {
+      "epoch": 2.0128,
+      "grad_norm": 2.133544683456421,
+      "learning_rate": 1.8474374255065554e-05,
+      "loss": 1.6812,
+      "step": 630
+    },
+    {
+      "epoch": 2.0448,
+      "grad_norm": 3.285512685775757,
+      "learning_rate": 1.7878426698450537e-05,
+      "loss": 1.6949,
+      "step": 640
+    },
+    {
+      "epoch": 2.0768,
+      "grad_norm": 2.3074097633361816,
+      "learning_rate": 1.728247914183552e-05,
+      "loss": 1.6071,
+      "step": 650
+    },
+    {
+      "epoch": 2.1088,
+      "grad_norm": 3.0742709636688232,
+      "learning_rate": 1.6686531585220503e-05,
+      "loss": 1.7419,
+      "step": 660
+    },
+    {
+      "epoch": 2.1408,
+      "grad_norm": 1.7113871574401855,
+      "learning_rate": 1.6090584028605486e-05,
+      "loss": 1.4946,
+      "step": 670
+    },
+    {
+      "epoch": 2.1728,
+      "grad_norm": 2.1010677814483643,
+      "learning_rate": 1.5494636471990466e-05,
+      "loss": 1.4342,
+      "step": 680
+    },
+    {
+      "epoch": 2.2048,
+      "grad_norm": 2.1286814212799072,
+      "learning_rate": 1.4898688915375447e-05,
+      "loss": 1.5922,
+      "step": 690
+    },
+    {
+      "epoch": 2.2368,
+      "grad_norm": 2.790449619293213,
+      "learning_rate": 1.430274135876043e-05,
+      "loss": 1.8599,
+      "step": 700
+    },
+    {
+      "epoch": 2.2688,
+      "grad_norm": 2.589710235595703,
+      "learning_rate": 1.3706793802145412e-05,
+      "loss": 1.5151,
+      "step": 710
+    },
+    {
+      "epoch": 2.3008,
+      "grad_norm": 2.7718470096588135,
+      "learning_rate": 1.3110846245530395e-05,
+      "loss": 1.6952,
+      "step": 720
+    },
+    {
+      "epoch": 2.3327999999999998,
+      "grad_norm": 2.431128978729248,
+      "learning_rate": 1.2514898688915378e-05,
+      "loss": 1.5283,
+      "step": 730
+    },
+    {
+      "epoch": 2.3648,
+      "grad_norm": 2.2655253410339355,
+      "learning_rate": 1.1918951132300358e-05,
+      "loss": 1.6497,
+      "step": 740
+    },
+    {
+      "epoch": 2.3968,
+      "grad_norm": 2.075089454650879,
+      "learning_rate": 1.132300357568534e-05,
+      "loss": 1.6415,
+      "step": 750
+    },
+    {
+      "epoch": 2.4288,
+      "grad_norm": 2.5529932975769043,
+      "learning_rate": 1.0727056019070322e-05,
+      "loss": 1.5808,
+      "step": 760
+    },
+    {
+      "epoch": 2.4608,
+      "grad_norm": 1.4980545043945312,
+      "learning_rate": 1.0131108462455305e-05,
+      "loss": 1.5812,
+      "step": 770
+    },
+    {
+      "epoch": 2.4928,
+      "grad_norm": 1.8248558044433594,
+      "learning_rate": 9.535160905840285e-06,
+      "loss": 1.4732,
+      "step": 780
+    },
+    {
+      "epoch": 2.5248,
+      "grad_norm": 3.1333248615264893,
+      "learning_rate": 8.939213349225268e-06,
+      "loss": 1.6248,
+      "step": 790
+    },
+    {
+      "epoch": 2.5568,
+      "grad_norm": 2.8021600246429443,
+      "learning_rate": 8.343265792610251e-06,
+      "loss": 1.6294,
+      "step": 800
+    },
+    {
+      "epoch": 2.5888,
+      "grad_norm": 1.942338466644287,
+      "learning_rate": 7.747318235995233e-06,
+      "loss": 1.5882,
+      "step": 810
+    },
+    {
+      "epoch": 2.6208,
+      "grad_norm": 2.3626883029937744,
+      "learning_rate": 7.151370679380215e-06,
+      "loss": 1.8162,
+      "step": 820
+    },
+    {
+      "epoch": 2.6528,
+      "grad_norm": 2.5083396434783936,
+      "learning_rate": 6.5554231227651975e-06,
+      "loss": 1.7253,
+      "step": 830
+    },
+    {
+      "epoch": 2.6848,
+      "grad_norm": 2.0188353061676025,
+      "learning_rate": 5.959475566150179e-06,
+      "loss": 1.7959,
+      "step": 840
+    },
+    {
+      "epoch": 2.7168,
+      "grad_norm": 2.74120831489563,
+      "learning_rate": 5.363528009535161e-06,
+      "loss": 1.6135,
+      "step": 850
+    },
+    {
+      "epoch": 2.7488,
+      "grad_norm": 2.245814561843872,
+      "learning_rate": 4.767580452920143e-06,
+      "loss": 1.6453,
+      "step": 860
+    },
+    {
+      "epoch": 2.7808,
+      "grad_norm": 2.572443962097168,
+      "learning_rate": 4.171632896305126e-06,
+      "loss": 1.6977,
+      "step": 870
+    },
+    {
+      "epoch": 2.8128,
+      "grad_norm": 2.1156558990478516,
+      "learning_rate": 3.5756853396901076e-06,
+      "loss": 1.5791,
+      "step": 880
+    },
+    {
+      "epoch": 2.8448,
+      "grad_norm": 2.4948208332061768,
+      "learning_rate": 2.9797377830750894e-06,
+      "loss": 1.6606,
+      "step": 890
+    },
+    {
+      "epoch": 2.8768000000000002,
+      "grad_norm": 2.791877269744873,
+      "learning_rate": 2.3837902264600713e-06,
+      "loss": 1.5517,
+      "step": 900
+    },
+    {
+      "epoch": 2.9088000000000003,
+      "grad_norm": 3.131666898727417,
+      "learning_rate": 1.7878426698450538e-06,
+      "loss": 1.7854,
+      "step": 910
+    },
+    {
+      "epoch": 2.9408,
+      "grad_norm": 2.1511664390563965,
+      "learning_rate": 1.1918951132300357e-06,
+      "loss": 1.6168,
+      "step": 920
+    },
+    {
+      "epoch": 2.9728,
+      "grad_norm": 2.804823637008667,
+      "learning_rate": 5.959475566150178e-07,
+      "loss": 1.6385,
+      "step": 930
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 939,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 50,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3919451258880000.0,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

distilgpt2/checkpoint-939/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:03869acb65d97d306a6961c4204630b6685180f03ac4c51eaa6ded2aad61a5e8
+size 5368

distilgpt2/checkpoint-939/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

distilgpt2/fp32_finetuned_model/config.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+  "_num_labels": 1,
+  "activation_function": "gelu_new",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50256,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50256,
+  "id2label": {
+    "0": "LABEL_0"
+  },
+  "initializer_range": 0.02,
+  "label2id": {
+    "LABEL_0": 0
+  },
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 1024,
+  "n_embd": 768,
+  "n_head": 12,
+  "n_inner": null,
+  "n_layer": 6,
+  "n_positions": 1024,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "task_specific_params": {
+    "text-generation": {
+      "do_sample": true,
+      "max_length": 50
+    }
+  },
+  "torch_dtype": "float32",
+  "transformers_version": "4.53.3",
+  "use_cache": true,
+  "vocab_size": 50257
+}

distilgpt2/fp32_finetuned_model/generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "transformers_version": "4.53.3"
+}

distilgpt2/fp32_finetuned_model/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

distilgpt2/fp32_finetuned_model/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7b9577ecb05b6c1e4859b0f7cce7ae902d9f5295f60ad68a01c27e67ec0345ca
+size 327657928

distilgpt2/fp32_finetuned_model/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "bos_token": "<|endoftext|>",
+  "eos_token": "<|endoftext|>",
+  "pad_token": "<|endoftext|>",
+  "unk_token": "<|endoftext|>"
+}

distilgpt2/fp32_finetuned_model/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

distilgpt2/fp32_finetuned_model/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,21 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "50256": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "model_max_length": 1024,
+  "pad_token": "<|endoftext|>",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>"
+}

distilgpt2/fp32_finetuned_model/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:03869acb65d97d306a6961c4204630b6685180f03ac4c51eaa6ded2aad61a5e8
+size 5368

distilgpt2/fp32_finetuned_model/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

distilgpt2/fp32_model/model.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fca0d5d3a30b01fb0894e0949cf9559738ebe028af2766d808678d1ba9490927
+size 327673810

distilgpt2/int4_model/model.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ee10c37d273a07ea757353bb7396821369e9d6c299465fd814173c4692c48c05
+size 327673810

distilgpt2/qat_model/config.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+  "_num_labels": 1,
+  "activation_function": "gelu_new",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50256,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50256,
+  "id2label": {
+    "0": "LABEL_0"
+  },
+  "initializer_range": 0.02,
+  "label2id": {
+    "LABEL_0": 0
+  },
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 1024,
+  "n_embd": 768,
+  "n_head": 12,
+  "n_inner": null,
+  "n_layer": 6,
+  "n_positions": 1024,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "task_specific_params": {
+    "text-generation": {
+      "do_sample": true,
+      "max_length": 50
+    }
+  },
+  "torch_dtype": "float32",
+  "transformers_version": "4.53.3",
+  "use_cache": true,
+  "vocab_size": 50257
+}

distilgpt2/qat_model/generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "transformers_version": "4.53.3"
+}

distilgpt2/qat_model/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

distilgpt2/qat_model/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3fe1dc2dbb641de2ef011adf9216bf7947c0e5d99a1eac027e2dedb1f531f2fe
+size 327657928

distilgpt2/qat_model/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "bos_token": "<|endoftext|>",
+  "eos_token": "<|endoftext|>",
+  "pad_token": "<|endoftext|>",
+  "unk_token": "<|endoftext|>"
+}

distilgpt2/qat_model/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

distilgpt2/qat_model/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,21 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "50256": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "model_max_length": 1024,
+  "pad_token": "<|endoftext|>",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>"
+}

distilgpt2/qat_model/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:03869acb65d97d306a6961c4204630b6685180f03ac4c51eaa6ded2aad61a5e8
+size 5368

distilgpt2/qat_model/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

distilgpt2/runs/Oct11_16-56-47_5a2111a52440/events.out.tfevents.1760201808.5a2111a52440.96.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:88b06c52c70c2baec6efc1bed997f4cf734fa581835621ef1082bb1d2a34cb64
+size 25259

distilgpt2/runs/Oct11_16-56-47_5a2111a52440/events.out.tfevents.1760203459.5a2111a52440.96.1 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:183d3b389e1081bc2fec3274f251fef4a93cd8c528f8b1a191fb7566d2016d1b
+size 25259