Ba2han committed on
Commit
334dfeb
·
1 Parent(s): f20bbeb

Training in progress, step 975

Browse files
Files changed (2) hide show
  1. README.md +2 -2
  2. config.json +27 -13
README.md CHANGED
@@ -4,8 +4,8 @@ model_name: experimental2
4
  tags:
5
  - generated_from_trainer
6
  - trl
7
- - sft
8
  - unsloth
 
9
  licence: license
10
  ---
11
 
@@ -27,7 +27,7 @@ print(output["generated_text"])
27
 
28
  ## Training procedure
29
 
30
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/batuhan409/huggingface/runs/0blltk43)
31
 
32
 
33
  This model was trained with SFT.
 
4
  tags:
5
  - generated_from_trainer
6
  - trl
 
7
  - unsloth
8
+ - sft
9
  licence: license
10
  ---
11
 
 
27
 
28
  ## Training procedure
29
 
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/batuhan409/huggingface/runs/dd1eu3b5)
31
 
32
 
33
  This model was trained with SFT.
config.json CHANGED
@@ -12,9 +12,9 @@
12
  "eos_token_id": 50031,
13
  "head_dim": 64,
14
  "hidden_act": "silu",
15
- "hidden_size": 1024,
16
  "initializer_range": 0.02,
17
- "intermediate_size": 2816,
18
  "layer_types": [
19
  "full_attention",
20
  "full_attention",
@@ -57,34 +57,48 @@
57
  "full_attention",
58
  "full_attention",
59
  "full_attention",
 
 
 
 
 
 
60
  "full_attention"
61
  ],
62
  "max_position_embeddings": 8192,
63
- "max_window_layers": 42,
 
64
  "model_name": "test_checkpoint",
65
  "model_type": "qwen3",
 
66
  "num_attention_heads": 16,
67
- "num_hidden_layers": 42,
68
  "num_key_value_heads": 4,
69
  "pad_token_id": 50034,
70
- "qk_norm_freeze_affine": true,
71
- "resid_lambda_init": 1.0,
 
 
 
 
72
  "rms_norm_eps": 1e-06,
73
  "rope_parameters": {
74
  "rope_theta": 50000,
75
  "rope_type": "default"
76
  },
 
77
  "sliding_window": null,
78
- "softcap_divisor": 7.5,
79
- "softcap_logits": true,
80
- "softcap_scale": 23.0,
81
- "softcap_shift": 5.0,
82
- "tie_word_embeddings": true,
83
  "transformers_version": "5.7.0",
84
  "unsloth_version": "2026.4.8",
85
  "use_cache": false,
86
- "use_qk_norm_patch": true,
87
  "use_sliding_window": false,
88
  "vocab_size": 50048,
89
- "x0_lambda_init": 0.1
 
 
 
 
90
  }
 
12
  "eos_token_id": 50031,
13
  "head_dim": 64,
14
  "hidden_act": "silu",
15
+ "hidden_size": 1152,
16
  "initializer_range": 0.02,
17
+ "intermediate_size": 2880,
18
  "layer_types": [
19
  "full_attention",
20
  "full_attention",
 
57
  "full_attention",
58
  "full_attention",
59
  "full_attention",
60
+ "full_attention",
61
+ "full_attention",
62
+ "full_attention",
63
+ "full_attention",
64
+ "full_attention",
65
+ "full_attention",
66
  "full_attention"
67
  ],
68
  "max_position_embeddings": 8192,
69
+ "max_window_layers": 48,
70
+ "mlp_type": "squared_relu",
71
  "model_name": "test_checkpoint",
72
  "model_type": "qwen3",
73
+ "n_layer": 48,
74
  "num_attention_heads": 16,
75
+ "num_hidden_layers": 48,
76
  "num_key_value_heads": 4,
77
  "pad_token_id": 50034,
78
+ "resid_lambda_end": 1.05,
79
+ "resid_lambda_init": 1.15,
80
+ "resid_lambda_init_end": 1.05,
81
+ "resid_lambda_init_start": 1.15,
82
+ "resid_scalar_lr_mult": 0.01,
83
+ "resid_scalar_weight_decay": 0.05,
84
  "rms_norm_eps": 1e-06,
85
  "rope_parameters": {
86
  "rope_theta": 50000,
87
  "rope_type": "default"
88
  },
89
+ "scalar_lr": 0.5,
90
  "sliding_window": null,
91
+ "squared_relu_activation": "relu2",
92
+ "squared_relu_intermediate_size": 2880,
93
+ "tie_word_embeddings": false,
 
 
94
  "transformers_version": "5.7.0",
95
  "unsloth_version": "2026.4.8",
96
  "use_cache": false,
 
97
  "use_sliding_window": false,
98
  "vocab_size": 50048,
99
+ "x0_lambda_end": 0.0,
100
+ "x0_lambda_init": 0.02,
101
+ "x0_lambda_init_end": 0.05,
102
+ "x0_lambda_init_start": 0.2,
103
+ "x0_scalar_weight_decay": 0.0
104
  }