YHLLEO committed on
Commit
ad05546
·
verified ·
1 Parent(s): 0345cab

Upload with explicit token

Browse files
checkpoints/0700000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16b2618f708c61d6552f9f067aab93b8226aa6e0e5814bf68756129a5ef5ba01
3
+ size 24814462194
config_2025-11-19T10-09-48.yaml ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ basic:
2
+ exp_name: 000_DSMoE_3B_E16_Flow
3
+ results_dir: exps/
4
+ data_path: /mmu_nlp_hdd/liuyahui06/datasets/imagenet/train
5
+ global_seed: 1234
6
+ epochs: 1000
7
+ log_every: 100
8
+ ckpt_every: 50000
9
+ rf: true
10
+ rf_ori: false
11
+ accum_iter: 1
12
+ clip_grad_norm: null
13
+ image_size: 256
14
+ global_batch_size: 512
15
+ num_workers: 16
16
+ timestep_start: 0
17
+ timestep_end: 1000
18
+ vae_path: stabilityai/sd-vae-ft-mse
19
+ model:
20
+ ckpt: null
21
+ target: models.models_DSMoE.DiT
22
+ params:
23
+ input_size: 32
24
+ num_classes: 1000
25
+ patch_size: 2
26
+ depth: 30
27
+ hidden_size: 1152
28
+ num_heads: 16
29
+ mlp_ratio: 4
30
+ use_swiglu: false
31
+ rope_type: 2d
32
+ use_sinks: false
33
+ sliding_window: 0
34
+ enable_gqa: false
35
+ norm_type: layernorm
36
+ MoE_config:
37
+ num_experts: 16
38
+ hidden_size: 1152
39
+ moe_intermediate_size: 2880
40
+ n_group: 2
41
+ topk_group: 2
42
+ num_experts_per_tok: 2
43
+ routed_scaling_factor: 2.0
44
+ capacity: 1
45
+ init_MoeMLP: false
46
+ interleave: true
47
+ skip_first2: false
48
+ skip_last2: false
49
+ use_shared_expert: true
50
+ CapacityPred_loss_weight: 1
51
+ optim:
52
+ base_learning_rate: 0.0001
53
+ weight_decay: 0
54
+ betas:
55
+ - 0.9
56
+ - 0.999
57
+ lr_sheduler:
58
+ warmup: null
59
+ train_epoch: null