DavidNguyen committed on
Commit
fd20217
·
verified ·
1 Parent(s): 553de27

Delete Pretrain_language_model/1BL3

Browse files
Pretrain_language_model/1BL3/deepseekv2/events.out.tfevents.1756257412.SPP00018465.511813.0 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7fbc90ec4b87685e9b26d03e8f1307a37277af02da1e9a0b58b820705e5853d4
3
- size 29263002
 
 
 
 
Pretrain_language_model/1BL3/smoe/post_validate/args.json DELETED
@@ -1,172 +0,0 @@
1
- {
2
- "profile": null,
3
- "name": "post_validate",
4
- "save_dir": "/cm/archive/namnv78_new/revise_checkpoints/pretrain_final/1BL3/smoe",
5
- "reset": 1,
6
- "log": "tb",
7
- "save_interval": "20000",
8
- "wandb_save_interval": "None",
9
- "seed": "none",
10
- "gpu": "auto",
11
- "keep_alive": 0,
12
- "sweep_id_for_grid_search": 0,
13
- "restore": "/cm/archive/namnv78_new/revise_checkpoints/pretrain_final/1BL3/smoe/slimpajama_moe_no_attmoe_1B/checkpoint/model-200000.pth",
14
- "wandb_bug_workaround": 0,
15
- "wandb_sync_checkpoints": 0,
16
- "batch_size": 10,
17
- "lr": 0.00025,
18
- "min_lr_multiplier": 0.1,
19
- "wd": 0.01,
20
- "lr_warmup": 4000,
21
- "test_interval": 1000,
22
- "n_microbatch": "None",
23
- "per_device_batch_size": "16",
24
- "lr_sched.steps": "",
25
- "lr_sched.gamma": 0.1,
26
- "lr_sched.type": "cos",
27
- "length_bucketed_sampling": 0,
28
- "grad_clip": "0.25",
29
- "test_batch_size": "None",
30
- "val_log_details": 0,
31
- "reg_scales": "",
32
- "reg_lin_decay": "",
33
- "reg": 1.0,
34
- "optimizer": "adamw",
35
- "adam.betas": "0.9,0.999",
36
- "adam.eps": 1e-08,
37
- "stop_after": "200000",
38
- "amp": 1,
39
- "bfloat16": 1,
40
- "nan_detect": 0,
41
- "max_length_per_batch": "none",
42
- "log_grad_norms": 0,
43
- "speedtest": "none",
44
- "dump_logs": 0,
45
- "debug_plot_interval": "none",
46
- "lm.trafo.context_blocks": 0,
47
- "lm.trafo.test_context_blocks": "none",
48
- "lm.trafo.same_length_eval": 0,
49
- "lm.trafo.same_length": 0,
50
- "lm.trafo.last_layer_context": 0,
51
- "lm.trafo.xl_init": 0,
52
- "lm.trafo.norm_input": 0,
53
- "rope.rotate_fraction": 0.5,
54
- "rope.base": 10000.0,
55
- "pkm.n_heads": 8,
56
- "moe.n_experts": 24,
57
- "moe.expert_size": 512,
58
- "moe_name": "smoe",
59
- "moe.selection_mode": "gate",
60
- "moe.perplexity_reg": 0.01,
61
- "moe.perplexity_reg_mode": "step",
62
- "moe.att.perplexity_reg_mode": "none",
63
- "moe.activation_after_topk": 0,
64
- "moe.att.expert_size": 256,
65
- "moe.topk": 2,
66
- "moe.bias": 0,
67
- "moe.sel_bias": 0,
68
- "moe.dropout_factor": 1.0,
69
- "moe.drop_expert": 0.0,
70
- "moe.sync_distributed": 1,
71
- "moe.init_scale": 1.0,
72
- "moe.att.n_experts": 4,
73
- "moe.att.enable": 0,
74
- "moe.att.q_expert": 1,
75
- "moe.att.k_expert": 1,
76
- "moe.att.v_expert": 1,
77
- "moe.att.o_expert": 1,
78
- "moe.att.k": 2,
79
- "moe.att.v_size": "none",
80
- "moe.att.same_sel": 0,
81
- "moe.att.expert_dropout": "none",
82
- "moe.att.selection_mode": "sigmoid",
83
- "moe.att.perplexity_reg": "none",
84
- "moe.att.drop_expert": "none",
85
- "moe.att.separate_kq_sel": 0,
86
- "moe.att.norm_init": 0,
87
- "moe.att.dropout": 0.0,
88
- "moe.att.selection_dropout": 0.0,
89
- "moe.nonorm": 0,
90
- "in_topk": 0,
91
- "balance_affinity": 1,
92
- "is_cosine": 0,
93
- "is_norm_weight": 0,
94
- "norm_softmax": 0,
95
- "norm_sigmoid": 0,
96
- "moa.cvloss": 0.0,
97
- "moa.switchloss": 0.0,
98
- "moa.zloss": 0.0,
99
- "balance_loss_coef": 0.01,
100
- "balance_loss_coef_comp": 0.001,
101
- "router_z_loss_coef": 0.001,
102
- "router_loss_coef": 0.01,
103
- "max_compete_in_iter": 3,
104
- "warm_up": 0.05,
105
- "rate_flip": 0.07,
106
- "router_theta": 0.1,
107
- "scale_weight": 1.0,
108
- "hybrid": 0,
109
- "tribrid": 0,
110
- "moa.miloss": 0.0,
111
- "sut.sample_topk": 0,
112
- "sut.max_relative_positions": 64,
113
- "sut.drop_gate": 0.0,
114
- "moe.selection_dropout": 0.0,
115
- "moe.layer_std_constant": 2.0,
116
- "transformer.universal.group_size": 24,
117
- "transformer.universal.group_type": "abab",
118
- "transformer.embedding_scale": "none",
119
- "transformer.topk_value": 32,
120
- "transformer.activation": "relu",
121
- "transformer.p_drop_layer": 0.0,
122
- "transformer.head_projection_size": "128",
123
- "transformer.act_loss": 0.0,
124
- "transformer.plot_head_details": 0,
125
- "lm.trafo.force_out_norm": 0,
126
- "plot.n_steps": -128,
127
- "dump_validation_plots": "",
128
- "details_log_interval": "500",
129
- "lm.state_drop_probability": 0.0,
130
- "lm.unroll": 1024,
131
- "lm.unroll_eval": "none",
132
- "lm.example_context": 100,
133
- "lm.example_window": 40,
134
- "lm.eval.blimp.batch_mul": 16,
135
- "lm.eval.enabled": 1,
136
- "lm.eval.lambada.enabled": 1,
137
- "lm.eval.cbt.batch_mul": 1,
138
- "lm.eval.cbt.length_limit": "none",
139
- "lm.eval.cbt.enabled": 1,
140
- "lm.eval.cbt.end_only": 0,
141
- "lm.eval.blimp.enabled": 1,
142
- "lm.eval.hellaswag.enabled": 1,
143
- "lm.eval.hellaswag.batch_mul": 16,
144
- "lm.eval.piqa.enabled": 1,
145
- "lm.eval.piqa.batch_mul": 16,
146
- "lm.eval.ai2arc.enabled": 1,
147
- "lm.eval.ai2arc.batch_mul": 4,
148
- "lm.eval.mmlu.enabled": 0,
149
- "lm.eval.openbookqa.enabled": 0,
150
- "lm.eval.race.enabled": 0,
151
- "lm.eval.siqa.enabled": 0,
152
- "lm.eval.winogrande.enabled": 0,
153
- "lm.eval.commonsenseqa.enabled": 0,
154
- "sentencepiece.n_pieces": 8000,
155
- "lmds.valid_ratio": 0.005,
156
- "thestack.languages": "python,html,c++,rust,javascript,haskell,scala,assembly",
157
- "state_size": 1024,
158
- "task": "slimpajama_transformer",
159
- "dropout": 0.0,
160
- "embedding_size": "none",
161
- "transformer.n_heads": 32,
162
- "transformer.variant": "preln_moe",
163
- "transformer.ff_multiplier": 2.0,
164
- "transformer.encoder_n_layers": 24,
165
- "transformer.attention_dropout": 0.0,
166
- "load_pretrained_model": null,
167
- "test_pretrained": 1,
168
- "train_baseline": 0,
169
- "test_only": 1,
170
- "save_name_logs": "results",
171
- "fs_cache_pattern": "*"
172
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Pretrain_language_model/1BL3/smoe/post_validate/startup_log.txt DELETED
@@ -1 +0,0 @@
1
- 2025-08-25 16:16:59.949247 SPP00018465: main.py --name post_validate --restore /cm/archive/namnv78_new/revise_checkpoints/pretrain_final/1BL3/smoe/slimpajama_moe_no_attmoe_1B/checkpoint/model-200000.pth --test_only 1 -reset 1 -lm.eval.enabled 1 -lm.eval.lambada.enabled 1 -lm.eval.cbt.enabled 1 -lm.eval.hellaswag.enabled 1 -lm.eval.piqa.enabled 1 -lm.eval.blimp.enabled 1 -lm.eval.ai2arc.enabled 1 --keep_alive 0 --batch_size 10
 
 
Pretrain_language_model/1BL3/smoe/post_validate/tensorboard/events.out.tfevents.1756113419.SPP00018465.1389420.0 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:813e3d419b1be4dd3ef6015a010b3eb7216b374563e7397a58285274cef675f4
3
- size 7763