Charlie81 committed on Sep 1, 2025

Commit

ccb26b5

1 Parent(s): bc05da9

delete checkpoints

Browse files

Files changed (22) hide show

checkpoints/checkpoint-200/config.json +0 -38
checkpoints/checkpoint-200/generation_config.json +0 -6
checkpoints/checkpoint-200/model-00001-of-00003.safetensors +0 -3
checkpoints/checkpoint-200/model-00002-of-00003.safetensors +0 -3
checkpoints/checkpoint-200/model-00003-of-00003.safetensors +0 -3
checkpoints/checkpoint-200/model.safetensors.index.json +0 -0
checkpoints/checkpoint-200/optimizer.pt +0 -3
checkpoints/checkpoint-200/rng_state.pth +0 -3
checkpoints/checkpoint-200/scheduler.pt +0 -3
checkpoints/checkpoint-200/trainer_state.json +0 -174
checkpoints/checkpoint-200/training_args.bin +0 -3
checkpoints/checkpoint-220/config.json +0 -38
checkpoints/checkpoint-220/generation_config.json +0 -6
checkpoints/checkpoint-220/model-00001-of-00003.safetensors +0 -3
checkpoints/checkpoint-220/model-00002-of-00003.safetensors +0 -3
checkpoints/checkpoint-220/model-00003-of-00003.safetensors +0 -3
checkpoints/checkpoint-220/model.safetensors.index.json +0 -0
checkpoints/checkpoint-220/optimizer.pt +0 -3
checkpoints/checkpoint-220/rng_state.pth +0 -3
checkpoints/checkpoint-220/scheduler.pt +0 -3
checkpoints/checkpoint-220/trainer_state.json +0 -188
checkpoints/checkpoint-220/training_args.bin +0 -3

checkpoints/checkpoint-200/config.json DELETED Viewed

@@ -1,38 +0,0 @@
-{
-  "architectures": [
-    "MyOlmoeForCausalLM"
-  ],
-  "attention_bias": false,
-  "attention_dropout": 0.0,
-  "clip_qkv": null,
-  "eos_token_id": 50279,
-  "hidden_act": "silu",
-  "hidden_size": 2048,
-  "initializer_range": 0.02,
-  "intermediate_size": 1024,
-  "max_position_embeddings": 4096,
-  "max_small_expert_count": 64,
-  "model_type": "olmoe",
-  "norm_topk_prob": false,
-  "num_attention_heads": 16,
-  "num_experts": 64,
-  "num_experts_per_tok": 2,
-  "num_hidden_layers": 16,
-  "num_key_value_heads": 16,
-  "num_small_experts": 64,
-  "output_router_logits": false,
-  "pad_token_id": 1,
-  "rms_norm_eps": 1e-05,
-  "rope_scaling": null,
-  "rope_theta": 10000.0,
-  "router_aux_loss_coef": 0.01,
-  "small_expert_count": 64,
-  "small_expert_intermediate_ratio": 64,
-  "small_expert_intermediate_size": 0,
-  "small_expert_sparsity_coef": 0.1,
-  "tie_word_embeddings": false,
-  "torch_dtype": "bfloat16",
-  "transformers_version": "4.55.4",
-  "use_cache": true,
-  "vocab_size": 50304
-}

checkpoints/checkpoint-200/generation_config.json DELETED Viewed

@@ -1,6 +0,0 @@
-{
-  "_from_model_config": true,
-  "eos_token_id": 50279,
-  "pad_token_id": 1,
-  "transformers_version": "4.55.4"
-}

checkpoints/checkpoint-200/model-00001-of-00003.safetensors DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9a1b9651bad1a045a178e22cf198d5070c94d4374f6331086e38801fe8d88ca3
-size 4997482624

checkpoints/checkpoint-200/model-00002-of-00003.safetensors DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:2e21faaebae001426a76f6a3c25309632bba780041e37b7b0b22c610cbdcfbdf
-size 4997867120

checkpoints/checkpoint-200/model-00003-of-00003.safetensors DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:12d42d85ae49df1bd0baaa04b9ee2464265f3ae089ed723ed7910b0a0721a2ae
-size 3856242664

checkpoints/checkpoint-200/model.safetensors.index.json DELETED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/checkpoint-200/optimizer.pt DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ba36ff3cf5aa103f2c5a162efe65287c247dccc43ee228fd9bcb3a14b6d695ae
-size 25858571

checkpoints/checkpoint-200/rng_state.pth DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
-size 14645

checkpoints/checkpoint-200/scheduler.pt DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e1a87144f92021f7270fcad4a86cd226d4640d1deb1f882885045e9d8fc1212f
-size 1465

checkpoints/checkpoint-200/trainer_state.json DELETED Viewed

@@ -1,174 +0,0 @@
-{
-  "best_global_step": null,
-  "best_metric": null,
-  "best_model_checkpoint": null,
-  "epoch": 0.009811316126737676,
-  "eval_steps": 500,
-  "global_step": 200,
-  "is_hyper_param_search": false,
-  "is_local_process_zero": true,
-  "is_world_process_zero": true,
-  "log_history": [
-    {
-      "epoch": 0.0004905658063368838,
-      "grad_norm": 52.75,
-      "learning_rate": 1.471550032701112e-07,
-      "loss": 104.1645,
-      "step": 10
-    },
-    {
-      "epoch": 0.0009811316126737675,
-      "grad_norm": 57.5,
-      "learning_rate": 3.106605624591236e-07,
-      "loss": 105.2674,
-      "step": 20
-    },
-    {
-      "epoch": 0.0014716974190106514,
-      "grad_norm": 46.25,
-      "learning_rate": 4.7416612164813603e-07,
-      "loss": 105.4015,
-      "step": 30
-    },
-    {
-      "epoch": 0.001962263225347535,
-      "grad_norm": 36.25,
-      "learning_rate": 6.376716808371485e-07,
-      "loss": 105.1723,
-      "step": 40
-    },
-    {
-      "epoch": 0.002452829031684419,
-      "grad_norm": 32.75,
-      "learning_rate": 8.011772400261609e-07,
-      "loss": 105.2332,
-      "step": 50
-    },
-    {
-      "epoch": 0.0029433948380213027,
-      "grad_norm": 42.25,
-      "learning_rate": 9.646827992151733e-07,
-      "loss": 104.8549,
-      "step": 60
-    },
-    {
-      "epoch": 0.003433960644358187,
-      "grad_norm": 35.5,
-      "learning_rate": 1.1281883584041859e-06,
-      "loss": 103.9609,
-      "step": 70
-    },
-    {
-      "epoch": 0.00392452645069507,
-      "grad_norm": 37.25,
-      "learning_rate": 1.2916939175931983e-06,
-      "loss": 104.4941,
-      "step": 80
-    },
-    {
-      "epoch": 0.0044150922570319545,
-      "grad_norm": 40.0,
-      "learning_rate": 1.4551994767822106e-06,
-      "loss": 104.7141,
-      "step": 90
-    },
-    {
-      "epoch": 0.004905658063368838,
-      "grad_norm": 41.5,
-      "learning_rate": 1.618705035971223e-06,
-      "loss": 104.9211,
-      "step": 100
-    },
-    {
-      "epoch": 0.005396223869705722,
-      "grad_norm": 51.25,
-      "learning_rate": 1.7822105951602354e-06,
-      "loss": 103.7137,
-      "step": 110
-    },
-    {
-      "epoch": 0.0058867896760426055,
-      "grad_norm": 39.5,
-      "learning_rate": 1.945716154349248e-06,
-      "loss": 105.1651,
-      "step": 120
-    },
-    {
-      "epoch": 0.006377355482379489,
-      "grad_norm": 37.75,
-      "learning_rate": 2.1092217135382606e-06,
-      "loss": 105.2588,
-      "step": 130
-    },
-    {
-      "epoch": 0.006867921288716374,
-      "grad_norm": 38.25,
-      "learning_rate": 2.2727272727272728e-06,
-      "loss": 104.3058,
-      "step": 140
-    },
-    {
-      "epoch": 0.007358487095053257,
-      "grad_norm": 40.0,
-      "learning_rate": 2.4362328319162854e-06,
-      "loss": 104.7016,
-      "step": 150
-    },
-    {
-      "epoch": 0.00784905290139014,
-      "grad_norm": 41.0,
-      "learning_rate": 2.5997383911052975e-06,
-      "loss": 103.8965,
-      "step": 160
-    },
-    {
-      "epoch": 0.008339618707727025,
-      "grad_norm": 41.0,
-      "learning_rate": 2.76324395029431e-06,
-      "loss": 105.4974,
-      "step": 170
-    },
-    {
-      "epoch": 0.008830184514063909,
-      "grad_norm": 32.5,
-      "learning_rate": 2.9267495094833227e-06,
-      "loss": 104.178,
-      "step": 180
-    },
-    {
-      "epoch": 0.009320750320400792,
-      "grad_norm": 35.0,
-      "learning_rate": 3.090255068672335e-06,
-      "loss": 104.7224,
-      "step": 190
-    },
-    {
-      "epoch": 0.009811316126737676,
-      "grad_norm": 40.5,
-      "learning_rate": 3.253760627861348e-06,
-      "loss": 105.291,
-      "step": 200
-    }
-  ],
-  "logging_steps": 10,
-  "max_steps": 61155,
-  "num_input_tokens_seen": 0,
-  "num_train_epochs": 3,
-  "save_steps": 20,
-  "stateful_callbacks": {
-    "TrainerControl": {
-      "args": {
-        "should_epoch_stop": false,
-        "should_evaluate": false,
-        "should_log": false,
-        "should_save": true,
-        "should_training_stop": false
-      },
-      "attributes": {}
-    }
-  },
-  "total_flos": 5.365480915206144e+17,
-  "train_batch_size": 2,
-  "trial_name": null,
-  "trial_params": null
-}

checkpoints/checkpoint-200/training_args.bin DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ffa490bde32401dd6d70c4f1b1cff8f5df114f94b79824d3a47b2ae8c00b822d
-size 5713

checkpoints/checkpoint-220/config.json DELETED Viewed

@@ -1,38 +0,0 @@
-{
-  "architectures": [
-    "MyOlmoeForCausalLM"
-  ],
-  "attention_bias": false,
-  "attention_dropout": 0.0,
-  "clip_qkv": null,
-  "eos_token_id": 50279,
-  "hidden_act": "silu",
-  "hidden_size": 2048,
-  "initializer_range": 0.02,
-  "intermediate_size": 1024,
-  "max_position_embeddings": 4096,
-  "max_small_expert_count": 64,
-  "model_type": "olmoe",
-  "norm_topk_prob": false,
-  "num_attention_heads": 16,
-  "num_experts": 64,
-  "num_experts_per_tok": 2,
-  "num_hidden_layers": 16,
-  "num_key_value_heads": 16,
-  "num_small_experts": 64,
-  "output_router_logits": false,
-  "pad_token_id": 1,
-  "rms_norm_eps": 1e-05,
-  "rope_scaling": null,
-  "rope_theta": 10000.0,
-  "router_aux_loss_coef": 0.01,
-  "small_expert_count": 64,
-  "small_expert_intermediate_ratio": 64,
-  "small_expert_intermediate_size": 0,
-  "small_expert_sparsity_coef": 0.1,
-  "tie_word_embeddings": false,
-  "torch_dtype": "bfloat16",
-  "transformers_version": "4.55.4",
-  "use_cache": true,
-  "vocab_size": 50304
-}

checkpoints/checkpoint-220/generation_config.json DELETED Viewed

@@ -1,6 +0,0 @@
-{
-  "_from_model_config": true,
-  "eos_token_id": 50279,
-  "pad_token_id": 1,
-  "transformers_version": "4.55.4"
-}

checkpoints/checkpoint-220/model-00001-of-00003.safetensors DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9a1b9651bad1a045a178e22cf198d5070c94d4374f6331086e38801fe8d88ca3
-size 4997482624

checkpoints/checkpoint-220/model-00002-of-00003.safetensors DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:0576e3f3539a0e79d172894fe362678c80b44a28cda8147ddd89c81a5f31886e
-size 4997867120

checkpoints/checkpoint-220/model-00003-of-00003.safetensors DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:3a1605301ccd4f90faaf68572324a548bf4c5e73dcd6f90f475b808a1a247096
-size 3856242664

checkpoints/checkpoint-220/model.safetensors.index.json DELETED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/checkpoint-220/optimizer.pt DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:361be11371d75e9b52d86c0ddaa928ba076a71f4b7a40ebd110351bd6b0d01c3
-size 25858571

checkpoints/checkpoint-220/rng_state.pth DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
-size 14645

checkpoints/checkpoint-220/scheduler.pt DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:1b3c0107f4752f47d660ab9481faad64ff112b078ad06a634204030d5c8d97e9
-size 1465

checkpoints/checkpoint-220/trainer_state.json DELETED Viewed

@@ -1,188 +0,0 @@
-{
-  "best_global_step": null,
-  "best_metric": null,
-  "best_model_checkpoint": null,
-  "epoch": 0.010792447739411444,
-  "eval_steps": 500,
-  "global_step": 220,
-  "is_hyper_param_search": false,
-  "is_local_process_zero": true,
-  "is_world_process_zero": true,
-  "log_history": [
-    {
-      "epoch": 0.0004905658063368838,
-      "grad_norm": 52.75,
-      "learning_rate": 1.471550032701112e-07,
-      "loss": 104.1645,
-      "step": 10
-    },
-    {
-      "epoch": 0.0009811316126737675,
-      "grad_norm": 57.5,
-      "learning_rate": 3.106605624591236e-07,
-      "loss": 105.2674,
-      "step": 20
-    },
-    {
-      "epoch": 0.0014716974190106514,
-      "grad_norm": 46.25,
-      "learning_rate": 4.7416612164813603e-07,
-      "loss": 105.4015,
-      "step": 30
-    },
-    {
-      "epoch": 0.001962263225347535,
-      "grad_norm": 36.25,
-      "learning_rate": 6.376716808371485e-07,
-      "loss": 105.1723,
-      "step": 40
-    },
-    {
-      "epoch": 0.002452829031684419,
-      "grad_norm": 32.75,
-      "learning_rate": 8.011772400261609e-07,
-      "loss": 105.2332,
-      "step": 50
-    },
-    {
-      "epoch": 0.0029433948380213027,
-      "grad_norm": 42.25,
-      "learning_rate": 9.646827992151733e-07,
-      "loss": 104.8549,
-      "step": 60
-    },
-    {
-      "epoch": 0.003433960644358187,
-      "grad_norm": 35.5,
-      "learning_rate": 1.1281883584041859e-06,
-      "loss": 103.9609,
-      "step": 70
-    },
-    {
-      "epoch": 0.00392452645069507,
-      "grad_norm": 37.25,
-      "learning_rate": 1.2916939175931983e-06,
-      "loss": 104.4941,
-      "step": 80
-    },
-    {
-      "epoch": 0.0044150922570319545,
-      "grad_norm": 40.0,
-      "learning_rate": 1.4551994767822106e-06,
-      "loss": 104.7141,
-      "step": 90
-    },
-    {
-      "epoch": 0.004905658063368838,
-      "grad_norm": 41.5,
-      "learning_rate": 1.618705035971223e-06,
-      "loss": 104.9211,
-      "step": 100
-    },
-    {
-      "epoch": 0.005396223869705722,
-      "grad_norm": 51.25,
-      "learning_rate": 1.7822105951602354e-06,
-      "loss": 103.7137,
-      "step": 110
-    },
-    {
-      "epoch": 0.0058867896760426055,
-      "grad_norm": 39.5,
-      "learning_rate": 1.945716154349248e-06,
-      "loss": 105.1651,
-      "step": 120
-    },
-    {
-      "epoch": 0.006377355482379489,
-      "grad_norm": 37.75,
-      "learning_rate": 2.1092217135382606e-06,
-      "loss": 105.2588,
-      "step": 130
-    },
-    {
-      "epoch": 0.006867921288716374,
-      "grad_norm": 38.25,
-      "learning_rate": 2.2727272727272728e-06,
-      "loss": 104.3058,
-      "step": 140
-    },
-    {
-      "epoch": 0.007358487095053257,
-      "grad_norm": 40.0,
-      "learning_rate": 2.4362328319162854e-06,
-      "loss": 104.7016,
-      "step": 150
-    },
-    {
-      "epoch": 0.00784905290139014,
-      "grad_norm": 41.0,
-      "learning_rate": 2.5997383911052975e-06,
-      "loss": 103.8965,
-      "step": 160
-    },
-    {
-      "epoch": 0.008339618707727025,
-      "grad_norm": 41.0,
-      "learning_rate": 2.76324395029431e-06,
-      "loss": 105.4974,
-      "step": 170
-    },
-    {
-      "epoch": 0.008830184514063909,
-      "grad_norm": 32.5,
-      "learning_rate": 2.9267495094833227e-06,
-      "loss": 104.178,
-      "step": 180
-    },
-    {
-      "epoch": 0.009320750320400792,
-      "grad_norm": 35.0,
-      "learning_rate": 3.090255068672335e-06,
-      "loss": 104.7224,
-      "step": 190
-    },
-    {
-      "epoch": 0.009811316126737676,
-      "grad_norm": 40.5,
-      "learning_rate": 3.253760627861348e-06,
-      "loss": 105.291,
-      "step": 200
-    },
-    {
-      "epoch": 0.01030188193307456,
-      "grad_norm": 51.5,
-      "learning_rate": 3.41726618705036e-06,
-      "loss": 104.8932,
-      "step": 210
-    },
-    {
-      "epoch": 0.010792447739411444,
-      "grad_norm": 40.5,
-      "learning_rate": 3.5807717462393727e-06,
-      "loss": 104.8132,
-      "step": 220
-    }
-  ],
-  "logging_steps": 10,
-  "max_steps": 61155,
-  "num_input_tokens_seen": 0,
-  "num_train_epochs": 3,
-  "save_steps": 20,
-  "stateful_callbacks": {
-    "TrainerControl": {
-      "args": {
-        "should_epoch_stop": false,
-        "should_evaluate": false,
-        "should_log": false,
-        "should_save": true,
-        "should_training_stop": false
-      },
-      "attributes": {}
-    }
-  },
-  "total_flos": 5.902029006726758e+17,
-  "train_batch_size": 2,
-  "trial_name": null,
-  "trial_params": null
-}

checkpoints/checkpoint-220/training_args.bin DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ffa490bde32401dd6d70c4f1b1cff8f5df114f94b79824d3a47b2ae8c00b822d
-size 5713