DavidNguyen commited on
Commit
a7f01c8
·
verified ·
1 Parent(s): cc6627b

b8c6e6efd9ab7a32a40ea8aa2a9aca807b546bf2873e10d07437ba9d8a8852d0

Browse files
Pretrain_language_model/1B/1BL2/smoe_sigmoid/post_validate/args.json ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "profile": null,
3
+ "name": "post_validate",
4
+ "save_dir": "/cm/archive/namnv78_new/revise_checkpoints/pretrain_final/1BL2/smoe_sigmoid",
5
+ "reset": 1,
6
+ "log": "tb",
7
+ "save_interval": "20000",
8
+ "wandb_save_interval": "None",
9
+ "seed": "none",
10
+ "gpu": "auto",
11
+ "keep_alive": 0,
12
+ "sweep_id_for_grid_search": 0,
13
+ "restore": "/cm/archive/namnv78_new/revise_checkpoints/pretrain_final/1BL2/smoe_sigmoid/slimpajama_moe_no_attmoe_1B/checkpoint/model-200000.pth",
14
+ "wandb_bug_workaround": 0,
15
+ "wandb_sync_checkpoints": 0,
16
+ "batch_size": 10,
17
+ "lr": 0.00025,
18
+ "min_lr_multiplier": 0.1,
19
+ "wd": 0.01,
20
+ "lr_warmup": 4000,
21
+ "test_interval": 1000,
22
+ "n_microbatch": "None",
23
+ "per_device_batch_size": "16",
24
+ "lr_sched.steps": "",
25
+ "lr_sched.gamma": 0.1,
26
+ "lr_sched.type": "cos",
27
+ "length_bucketed_sampling": 0,
28
+ "grad_clip": "0.25",
29
+ "test_batch_size": "None",
30
+ "val_log_details": 0,
31
+ "reg_scales": "",
32
+ "reg_lin_decay": "",
33
+ "reg": 1.0,
34
+ "optimizer": "adamw",
35
+ "adam.betas": "0.9,0.999",
36
+ "adam.eps": 1e-08,
37
+ "stop_after": "200000",
38
+ "amp": 1,
39
+ "bfloat16": 1,
40
+ "nan_detect": 0,
41
+ "max_length_per_batch": "none",
42
+ "log_grad_norms": 0,
43
+ "speedtest": "none",
44
+ "dump_logs": 0,
45
+ "debug_plot_interval": "none",
46
+ "lm.trafo.context_blocks": 0,
47
+ "lm.trafo.test_context_blocks": "none",
48
+ "lm.trafo.same_length_eval": 0,
49
+ "lm.trafo.same_length": 0,
50
+ "lm.trafo.last_layer_context": 0,
51
+ "lm.trafo.xl_init": 0,
52
+ "lm.trafo.norm_input": 0,
53
+ "rope.rotate_fraction": 0.5,
54
+ "rope.base": 10000.0,
55
+ "pkm.n_heads": 8,
56
+ "moe.n_experts": 24,
57
+ "moe.expert_size": 512,
58
+ "moe_name": "smoe_sigmoid",
59
+ "moe.selection_mode": "gate",
60
+ "moe.perplexity_reg": 0.01,
61
+ "moe.perplexity_reg_mode": "step",
62
+ "moe.att.perplexity_reg_mode": "none",
63
+ "moe.activation_after_topk": 0,
64
+ "moe.att.expert_size": 256,
65
+ "moe.topk": 2,
66
+ "moe.bias": 0,
67
+ "moe.sel_bias": 0,
68
+ "moe.dropout_factor": 1.0,
69
+ "moe.drop_expert": 0.0,
70
+ "moe.sync_distributed": 1,
71
+ "moe.init_scale": 1.0,
72
+ "moe.att.n_experts": 4,
73
+ "moe.att.enable": 0,
74
+ "moe.att.q_expert": 1,
75
+ "moe.att.k_expert": 1,
76
+ "moe.att.v_expert": 1,
77
+ "moe.att.o_expert": 1,
78
+ "moe.att.k": 2,
79
+ "moe.att.v_size": "none",
80
+ "moe.att.same_sel": 0,
81
+ "moe.att.expert_dropout": "none",
82
+ "moe.att.selection_mode": "sigmoid",
83
+ "moe.att.perplexity_reg": "none",
84
+ "moe.att.drop_expert": "none",
85
+ "moe.att.separate_kq_sel": 0,
86
+ "moe.att.norm_init": 0,
87
+ "moe.att.dropout": 0.0,
88
+ "moe.att.selection_dropout": 0.0,
89
+ "moe.nonorm": 0,
90
+ "in_topk": 0,
91
+ "balance_affinity": 1,
92
+ "is_cosine": 0,
93
+ "is_norm_weight": 0,
94
+ "norm_softmax": 0,
95
+ "norm_sigmoid": 0,
96
+ "moa.cvloss": 0.0,
97
+ "moa.switchloss": 0.0,
98
+ "moa.zloss": 0.0,
99
+ "balance_loss_coef": 0.01,
100
+ "balance_loss_coef_comp": 0.001,
101
+ "router_z_loss_coef": 0.001,
102
+ "router_loss_coef": 0.01,
103
+ "max_compete_in_iter": 3,
104
+ "warm_up": 0.05,
105
+ "rate_flip": 0.07,
106
+ "router_theta": 0.1,
107
+ "scale_weight": 1.0,
108
+ "hybrid": 0,
109
+ "tribrid": 0,
110
+ "moa.miloss": 0.0,
111
+ "sut.sample_topk": 0,
112
+ "sut.max_relative_positions": 64,
113
+ "sut.drop_gate": 0.0,
114
+ "moe.selection_dropout": 0.0,
115
+ "moe.layer_std_constant": 2.0,
116
+ "transformer.universal.group_size": 24,
117
+ "transformer.universal.group_type": "abab",
118
+ "transformer.embedding_scale": "none",
119
+ "transformer.topk_value": 32,
120
+ "transformer.activation": "relu",
121
+ "transformer.p_drop_layer": 0.0,
122
+ "transformer.head_projection_size": "128",
123
+ "transformer.act_loss": 0.0,
124
+ "transformer.plot_head_details": 0,
125
+ "lm.trafo.force_out_norm": 0,
126
+ "plot.n_steps": -128,
127
+ "dump_validation_plots": "",
128
+ "details_log_interval": "500",
129
+ "lm.state_drop_probability": 0.0,
130
+ "lm.unroll": 1024,
131
+ "lm.unroll_eval": "none",
132
+ "lm.example_context": 100,
133
+ "lm.example_window": 40,
134
+ "lm.eval.blimp.batch_mul": 16,
135
+ "lm.eval.enabled": 1,
136
+ "lm.eval.lambada.enabled": 1,
137
+ "lm.eval.cbt.batch_mul": 1,
138
+ "lm.eval.cbt.length_limit": "none",
139
+ "lm.eval.cbt.enabled": 1,
140
+ "lm.eval.cbt.end_only": 0,
141
+ "lm.eval.blimp.enabled": 1,
142
+ "lm.eval.hellaswag.enabled": 1,
143
+ "lm.eval.hellaswag.batch_mul": 16,
144
+ "lm.eval.piqa.enabled": 1,
145
+ "lm.eval.piqa.batch_mul": 16,
146
+ "lm.eval.ai2arc.enabled": 1,
147
+ "lm.eval.ai2arc.batch_mul": 4,
148
+ "lm.eval.mmlu.enabled": 0,
149
+ "lm.eval.openbookqa.enabled": 0,
150
+ "lm.eval.race.enabled": 0,
151
+ "lm.eval.siqa.enabled": 0,
152
+ "lm.eval.winogrande.enabled": 0,
153
+ "lm.eval.commonsenseqa.enabled": 0,
154
+ "sentencepiece.n_pieces": 8000,
155
+ "lmds.valid_ratio": 0.005,
156
+ "thestack.languages": "python,html,c++,rust,javascript,haskell,scala,assembly",
157
+ "state_size": 1024,
158
+ "task": "slimpajama_transformer",
159
+ "dropout": 0.0,
160
+ "embedding_size": "none",
161
+ "transformer.n_heads": 32,
162
+ "transformer.variant": "preln_moe",
163
+ "transformer.ff_multiplier": 2.0,
164
+ "transformer.encoder_n_layers": 24,
165
+ "transformer.attention_dropout": 0.0,
166
+ "load_pretrained_model": null,
167
+ "test_pretrained": 1,
168
+ "train_baseline": 0,
169
+ "test_only": 1,
170
+ "save_name_logs": "results",
171
+ "fs_cache_pattern": "*"
172
+ }
Pretrain_language_model/1B/1BL2/smoe_sigmoid/post_validate/startup_log.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ 2025-08-23 09:21:21.099864 SPP00018465: main.py --name post_validate --restore /cm/archive/namnv78_new/revise_checkpoints/pretrain_final/1BL2/smoe_sigmoid/slimpajama_moe_no_attmoe_1B/checkpoint/model-200000.pth --test_only 1 -reset 1 -lm.eval.enabled 1 -lm.eval.lambada.enabled 1 -lm.eval.cbt.enabled 1 -lm.eval.hellaswag.enabled 1 -lm.eval.piqa.enabled 1 -lm.eval.blimp.enabled 1 -lm.eval.ai2arc.enabled 1 --keep_alive 0 --batch_size 20
2
+ 2025-08-23 11:04:41.934938 SPP00018465: main.py --name post_validate --restore /cm/archive/namnv78_new/revise_checkpoints/pretrain_final/1BL2/smoe_sigmoid/slimpajama_moe_no_attmoe_1B/checkpoint/model-200000.pth --test_only 1 -reset 1 -lm.eval.enabled 1 -lm.eval.lambada.enabled 1 -lm.eval.cbt.enabled 1 -lm.eval.hellaswag.enabled 1 -lm.eval.piqa.enabled 1 -lm.eval.blimp.enabled 1 -lm.eval.ai2arc.enabled 1 --keep_alive 0 --batch_size 10
Pretrain_language_model/1B/1BL2/smoe_sigmoid/post_validate/tensorboard/events.out.tfevents.1755921881.SPP00018465.925002.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3869cd86db2cb15ffecf4dbb6099647c38e979c83a5a95d151b68712e571dc74
3
+ size 282