Ba2han committed on
Commit
e79d758
·
verified ·
1 Parent(s): 697872c

Training in progress, step 975, checkpoint

Browse files
last-checkpoint/config.json CHANGED
@@ -12,9 +12,9 @@
12
  "eos_token_id": 50031,
13
  "head_dim": 64,
14
  "hidden_act": "silu",
15
- "hidden_size": 1024,
16
  "initializer_range": 0.02,
17
- "intermediate_size": 2816,
18
  "layer_types": [
19
  "full_attention",
20
  "full_attention",
@@ -57,34 +57,48 @@
57
  "full_attention",
58
  "full_attention",
59
  "full_attention",
 
 
 
 
 
 
60
  "full_attention"
61
  ],
62
  "max_position_embeddings": 8192,
63
- "max_window_layers": 42,
 
64
  "model_name": "test_checkpoint",
65
  "model_type": "qwen3",
 
66
  "num_attention_heads": 16,
67
- "num_hidden_layers": 42,
68
  "num_key_value_heads": 4,
69
  "pad_token_id": 50034,
70
- "qk_norm_freeze_affine": true,
71
- "resid_lambda_init": 1.0,
 
 
 
 
72
  "rms_norm_eps": 1e-06,
73
  "rope_parameters": {
74
  "rope_theta": 50000,
75
  "rope_type": "default"
76
  },
 
77
  "sliding_window": null,
78
- "softcap_divisor": 7.5,
79
- "softcap_logits": true,
80
- "softcap_scale": 23.0,
81
- "softcap_shift": 5.0,
82
- "tie_word_embeddings": true,
83
  "transformers_version": "5.7.0",
84
  "unsloth_version": "2026.4.8",
85
  "use_cache": false,
86
- "use_qk_norm_patch": true,
87
  "use_sliding_window": false,
88
  "vocab_size": 50048,
89
- "x0_lambda_init": 0.1
 
 
 
 
90
  }
 
12
  "eos_token_id": 50031,
13
  "head_dim": 64,
14
  "hidden_act": "silu",
15
+ "hidden_size": 1152,
16
  "initializer_range": 0.02,
17
+ "intermediate_size": 2880,
18
  "layer_types": [
19
  "full_attention",
20
  "full_attention",
 
57
  "full_attention",
58
  "full_attention",
59
  "full_attention",
60
+ "full_attention",
61
+ "full_attention",
62
+ "full_attention",
63
+ "full_attention",
64
+ "full_attention",
65
+ "full_attention",
66
  "full_attention"
67
  ],
68
  "max_position_embeddings": 8192,
69
+ "max_window_layers": 48,
70
+ "mlp_type": "squared_relu",
71
  "model_name": "test_checkpoint",
72
  "model_type": "qwen3",
73
+ "n_layer": 48,
74
  "num_attention_heads": 16,
75
+ "num_hidden_layers": 48,
76
  "num_key_value_heads": 4,
77
  "pad_token_id": 50034,
78
+ "resid_lambda_end": 1.05,
79
+ "resid_lambda_init": 1.15,
80
+ "resid_lambda_init_end": 1.05,
81
+ "resid_lambda_init_start": 1.15,
82
+ "resid_scalar_lr_mult": 0.01,
83
+ "resid_scalar_weight_decay": 0.05,
84
  "rms_norm_eps": 1e-06,
85
  "rope_parameters": {
86
  "rope_theta": 50000,
87
  "rope_type": "default"
88
  },
89
+ "scalar_lr": 0.5,
90
  "sliding_window": null,
91
+ "squared_relu_activation": "relu2",
92
+ "squared_relu_intermediate_size": 2880,
93
+ "tie_word_embeddings": false,
 
 
94
  "transformers_version": "5.7.0",
95
  "unsloth_version": "2026.4.8",
96
  "use_cache": false,
 
97
  "use_sliding_window": false,
98
  "vocab_size": 50048,
99
+ "x0_lambda_end": 0.0,
100
+ "x0_lambda_init": 0.02,
101
+ "x0_lambda_init_end": 0.05,
102
+ "x0_lambda_init_start": 0.2,
103
+ "x0_scalar_weight_decay": 0.0
104
  }
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:51479eb47d8f84df8c30abb50d9b3351df21b89bd08f633b42fdc763a98a6afc
3
- size 1049600104
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f53d74af2b3082df05a025c64c68722ca660725afe94f4ae6de06107074aea1
3
+ size 1151039640
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:68c404a016a8c58db8c4edd6542284b1ca315086d194e34fb712c4c51033f15d
3
- size 2100565997
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc43251426c01c5194817d1ba92cdf96daaad2adc64751ea1aef393a6abc73b4
3
+ size 1845917173
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b6798a8c586f26248338e3100b61cf55d872d73760b54611eb7a3a02446b02b8
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe5ceb4c5544b6dc38fdd0f65e3d1e312afea764e312ae37fab4d14e1151e068
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:98b5e40ea08c3df3a67cfcbc7bd13f852d13ab6f405453bac397e60b19957472
3
- size 9785
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c38cbc81da7001df5721d532f7ec63029b08f4cd8a871e76dff5146ee8c29f4
3
+ size 1529
last-checkpoint/trainer_state.json CHANGED
The diff for this file is too large to render. See raw diff
 
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:07a9b855879219e71fab07ff139f60083d82e65b9fef7b37c3d43fa66609d389
3
  size 5777
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b350586eae4abc083bd0fbabe256a8a042109d3a895c5dbd482bedadc195db10
3
  size 5777