minzh23 committed on
Commit
156581e
·
verified ·
1 Parent(s): eb0a328

[debug] fix config format

Browse files
qwen3_0.6b+qwen3_4b_base_Fuser/config.json CHANGED
@@ -1,82 +1,60 @@
1
- model:
2
- base_model: "Qwen/Qwen3-0.6B"
3
- teacher_model: "Qwen/Qwen3-4B-Base"
4
- include_response: false
5
- is_do_alignment: false
6
- alignment_strategy: "first"
7
- projector:
8
- type: "C2CProjector"
9
- params:
10
- hidden_dim: 1024
11
- intermediate_dim: 1024
12
- num_layers: 3
13
- dropout: 0.1
14
- initial_temperature: 1.0
15
- final_temperature: 0.001
16
- anneal_steps: 1953
17
- # projector:
18
- # type: "AllInOneProjector"
19
- # params:
20
- # hidden_dim: 1024
21
- # weight_hidden_dim: 1024
22
- # num_layers: 3
23
- # dropout: 0.1
24
- # activation: "gelu"
25
- # use_layer_norm: true
26
- # use_residual: true
27
- # use_swiglu: true
28
- # use_concat: true
29
- # gate_granularity: "scalar"
30
- # gate_depends_on_input: false
31
- # gate_input_features: "target_key"
32
- # gate_init_value: 0.0
33
- # weight_granularity: "head_merged"
34
- # weight_depends_on_input: true
35
- # weight_input_features: "target_projected_key"
36
- # weight_init_value: 0.0
37
- # use_gumbel: true
38
- # initial_temperature: 1.0
39
- # final_temperature: 0.001
40
- # preserve_target_weight: false
41
- # add_self: true
42
- # anneal_steps: 1929
43
- # scalar_temperature: 1.0
44
- # max_sequence_length: 8192
45
- mapping: "last_aligned"
46
-
47
- training:
48
- learning_rate: 0.0001
49
- weight_decay: 0.01
50
- num_epochs: 1
51
- max_length: 2048
52
- device: "cuda"
53
- scheduler_type: "linear"
54
- warmup_ratio: 0.1
55
- max_grad_norm: 1.0
56
- gradient_accumulation_steps: 4
57
- per_device_train_batch_size: 8
58
- num_processes: 8
59
- freeze:
60
- - "teacher"
61
- - "base"
62
- seed: 42
63
-
64
- output:
65
- output_dir: "local/checkpoints/Q3-0.6B_Q3-4B-Base_general_500k_C2C"
66
- save_steps: 500
67
- eval_steps: 100
68
- wandb_config:
69
- project: "Rosetta"
70
- mode: "online"
71
- entity: "nics-efc"
72
- run_name: "Q3-0.6B_Q3-4B-Base_general_500k_C2C"
73
-
74
- data:
75
- type: "OpenHermesChatDataset"
76
- # type: "DollyChatDataset"
77
-
78
- kwargs:
79
- split: "train"
80
- max_word_count: 2048
81
- num_samples: 500000
82
- train_ratio: 0.99
 
1
+ {
2
+ "model": {
3
+ "base_model": "Qwen/Qwen3-0.6B",
4
+ "teacher_model": "Qwen/Qwen3-4B-Base",
5
+ "include_response": false,
6
+ "is_do_alignment": false,
7
+ "alignment_strategy": "first",
8
+ "projector": {
9
+ "type": "C2CProjector",
10
+ "params": {
11
+ "hidden_dim": 1024,
12
+ "intermediate_dim": 1024,
13
+ "num_layers": 3,
14
+ "dropout": 0.1,
15
+ "initial_temperature": 1.0,
16
+ "final_temperature": 0.001,
17
+ "anneal_steps": 1953
18
+ }
19
+ },
20
+ "mapping": "last_aligned"
21
+ },
22
+ "training": {
23
+ "learning_rate": 0.0001,
24
+ "weight_decay": 0.01,
25
+ "num_epochs": 1,
26
+ "max_length": 2048,
27
+ "device": "cuda",
28
+ "scheduler_type": "linear",
29
+ "warmup_ratio": 0.1,
30
+ "max_grad_norm": 1.0,
31
+ "gradient_accumulation_steps": 4,
32
+ "per_device_train_batch_size": 8,
33
+ "num_processes": 8,
34
+ "freeze": [
35
+ "teacher",
36
+ "base"
37
+ ],
38
+ "seed": 42
39
+ },
40
+ "output": {
41
+ "output_dir": "local/checkpoints/Q3-0.6B_Q3-4B-Base_general_500k_C2C",
42
+ "save_steps": 500,
43
+ "eval_steps": 100,
44
+ "wandb_config": {
45
+ "project": "Rosetta",
46
+ "mode": "online",
47
+ "entity": "nics-efc",
48
+ "run_name": "Q3-0.6B_Q3-4B-Base_general_500k_C2C"
49
+ }
50
+ },
51
+ "data": {
52
+ "type": "OpenHermesChatDataset",
53
+ "kwargs": {
54
+ "split": "train",
55
+ "max_word_count": 2048,
56
+ "num_samples": 500000
57
+ },
58
+ "train_ratio": 0.99
59
+ }
60
+ }