Text Generation
Transformers
Safetensors
llada2_moe
conversational
custom_code
File size: 4,998 Bytes
2c4aa79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# --- Demo + LLaDA2 block-diffusion SFT launch commands (scratch notes; ---
# --- paths are specific to this cluster account) -------------------------

# Run the demo on a single GPU.
CUDA_VISIBLE_DEVICES=0 python demo.py

# Launch SFT via the VeOmni launcher; PYTHONPATH is prefixed so the in-repo
# VeOmni checkout is importable. NOTE(review): $(pwd) is unquoted — fine
# only while the working directory path contains no spaces.
PYTHONPATH=$(pwd)/VeOmni:$PYTHONPATH sh train.sh tasks/train_llada2_bd.py configs/sft/llada2_mini_bd_sft.yaml

# Same launcher, "semi2" task variant, newer config.
PYTHONPATH=$(pwd)/VeOmni:$PYTHONPATH sh train.sh tasks/train_llada2_bd_semi2.py configs/sft/llada2_mini_bd_sft_new.yaml

# Same launcher, "hybrid" task variant, newer config.
PYTHONPATH=$(pwd)/VeOmni:$PYTHONPATH sh train.sh tasks/train_llada2_bd_hybrid.py configs/sft/llada2_mini_bd_sft_new.yaml

# Batch sizes used per SFT run (notes only — these lines are not commands
# and are not meant to be executed).
sft2 batchsize=8
sft3 batchsize=32
sft4 batchsize=8

# Merge the MoE expert weight shards of the base checkpoint into a single
# merged checkpoint. NOTE(review): merge/split semantics inferred from the
# --mode flag only — confirm against scripts/moe_convertor.py.
python scripts/moe_convertor.py \
  --input-path /scratch/e0973935/model_weights/local_LLaDA2.1-mini \
  --output-path /scratch/e0973935/model_weights/local_LLaDA2.1-mini-merge \
  --mode merge

# Merge the custom SFT checkpoint (run "sft_27") the same way.
python scripts/moe_convertor.py \
  --input-path /scratch/e0973935/model_weights/llada2.0_mini_sft_27 \
  --output-path /scratch/e0973935/model_weights/local_LLaDA2.0-mini-merge-cust \
  --mode merge

# Inverse direction: split a merged HF checkpoint (ablation run, global
# step 179430) back into per-expert shards.
python scripts/moe_convertor.py \
  --input-path /scratch/e0973935/dFactory/llada2_mini_bd_sft_outputs_mathabla/checkpoints/global_step_179430/hf_ckpt \
  --output-path /scratch/e0973935/model_weights/llada2.0_mini_abla \
  --mode split

# Request an interactive PBS session: 1 node, 2 GPUs, 1h40m walltime,
# billed to project CFP03-SF-102.
qsub -I \
 -P CFP03-SF-102 \
 -l select=1:ngpus=2 \
 -l walltime=1:40:00



outputs3 online 0.6-1.0 lr=1e-6 bsz=8
outputs4 online 0.6-1.0 lr=1e-5 bsz=64
outputs5 online 0.4-0.8 lr=1e-6 bsz=8 allmath
outputs6 online 0.4-0.8 lr=1e-6 bsz=8 allmath onpolicyremask
outputs7 online 0.6-0.8 lr=1e-6 bsz=8 allmath
outputs8 online 0.6-0.8 lr=2e-6 bsz=8 allmath
outputs9 online 0.3-0.8 lr=1e-6 bsz=8 allmath ar-mask
outputs10 online 0.0-1.0 lr=1e-6 bsz=8 allmath ar-mask
outputs11 online 0.6-0.8 lr=1e-6 bsz=8 allmath+
outputs12 online 0.6-0.8 lr=5e-7 bsz=8 allmath+
outputs13 online 0.6-0.8 lr=1e-6 bsz=8 allmath+ block=64
outputs14 online 0.6-0.8 lr=2e-6 bsz=8 allmath+
outputs16 online 0.3-0.8 lr=1e-6 bsz=8 allmath+ ar-mask-8 label-mask
outputs17 online 0.3-0.5 lr=1e-6 bsz=8 allmath ar-mask
outputs18 online 0.6-0.8 lr=4e-6 bsz=8 allmath+
outputs19 online 0.6-0.8 lr=1e-5 bsz=8 allmath+
outputs20 online 0.6-0.8 lr=4e-6 bsz=8 allmath+ blockrand
outputs21 online 0.7-0.7 lr=4e-6 bsz=8 allmath+

outputs23 online 0.3-0.8 lr=2e-6 bsz=8 allmath+ ar-mask
outputs24 online 0.3-0.8 lr=2e-6 bsz=8 allmath+ gumblemask
outputs25 online 0.6-0.8 lr=2e-6 bsz=8 allmath+ gumblemask

outputs26 online 0.6-0.8 lr=2e-6 bsz=8 allmath++ 
outputs27 online 0.75    lr=2e-6 bsz=8 allmath++ 
outputs28 online 0.6-0.8 lr=2e-6 bsz=8 allmath++ label-mask
outputs29 online 0.75    lr=2e-6 bsz=8 allmath++ gumblemask thresh=0.5
outputs30 online 0.75    lr=2e-6 bsz=8 allmath++ gumblemask thresh=0.3
outputs31 online 0.5-0.8 lr=2e-6 bsz=8 allmath++ gumblemask thresh=0.3
outputs32 online 0.75       lr=2e-6 bsz=8 allmath+ rkd
outputs33 online 0.6-1.0    lr=2e-6 bsz=8 allmath+ rkd
outputs34 online 0.75       lr=2e-6 bsz=8 allmath+ rkd w0.25
outputs36 online 0.75       lr=2e-6 bsz=8 allmath+ ar-attention
outputs37 online 0.75       lr=2e-6 bsz=8 allmath+ ar-attention-no-uni

outputs38 online 0.75       lr=2e-6 bsz=8 allmath+ cont k=3
outputs39 online 0.6-0.8    lr=2e-6 bsz=8 allmath+ cont k=3
outputs40 online 0.75       lr=2e-6 bsz=8 allmath+ cont k=1
outputs41 online 0.6-1.0    lr=2e-6 bsz=8 allmath+ cont k=1
outputs42 online 0.75       lr=2e-6 bsz=8 allcode+

outputs43 online 0.75       lr=2e-6 bsz=8 allmath+ cont-norm k=1
outputs44 online 0.6-1.0    lr=2e-6 bsz=8 allmath+ cont-norm k=1
outputs45 online 0.75       lr=2e-6 bsz=8 allmath+ cont-norm k=3
outputs47 online 0.75       lr=2e-6 bsz=8 allmath+ cont-norm nomask k=3

outputs48 online 0.6-0.9    lr=2e-6 bsz=8 allmath+ 
outputs49 online 0.7-0.9    lr=2e-6 bsz=8 allcode+-  
outputs50 online 0.75    lr=2e-6 bsz=8 allcode+- 
outputs51 online 0.6-0.8    lr=2e-6 bsz=8 allcode+  
outputs52 online 0.75    lr=2e-6 bsz=8 allmath++ 27+epoch2 


outputs61 online 0.8    lr=2e-6 bsz=4 codefinal epoch=1 

# Make the local dInfer checkout importable.
# Fix: use ${PYTHONPATH:+:${PYTHONPATH}} so that when PYTHONPATH is unset
# or empty we do NOT append a dangling ":" — a trailing empty entry makes
# Python treat the current working directory as an import path, which can
# silently shadow installed packages.
export PYTHONPATH="/scratch/e0973935/dInfer/python${PYTHONPATH:+:${PYTHONPATH}}"
# Sanity check: confirm which dinfer package actually gets picked up.
python -c "import dinfer; print(dinfer.__file__)"


# Cluster login / project shorthand (notes — `amgr`/`hpc` are site tools).
amgr login

hpc project

# Smoke-test model loading across 4 GPUs.
CUDA_VISIBLE_DEVICES=0,1,2,3 python load.py

# Single-GPU DeepSpeed run of the compression training script.
deepspeed --include localhost:0 train_compress_ed2.py

# Full single-node (8 GPU) DeepSpeed run.
deepspeed --num_nodes=1 --num_gpus=8 train_compress3.py


# Build flash-attn from source with limited build parallelism (keeps peak
# memory down on login nodes).
MAX_JOBS=4 pip install flash-attn --no-build-isolation

# Pinned version, high build parallelism (needs a beefy build host).
MAX_JOBS=64 pip install flash_attn==2.8.3 --no-build-isolation

# Prebuilt wheel (cu12 / torch 2.8 / cxx11abi / cp310) — skips compilation
# entirely; must match the installed CUDA/torch/python exactly.
pip install https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu12torch2.8cxx11abiTRUE-cp310-cp310-linux_x86_64.whl

# --- One-off scp transfers between home, scratch, project storage, and ---
# --- local machines (hostnames/usernames are cluster-specific) -----------

# Home -> scratch on the same cluster.
scp -r /home/svu/e0973935/CompThinker /scratch/e0973935

# Share model weights into another user's scratch space.
scp -r /scratch/e0973935/model_weights/custom_Qwen3-1.7B /scratch/e0950166

# Local Mac -> cluster. NOTE(review): the unquoted glob expands on the
# local side before scp runs.
scp -r /Users/yuruonan/Downloads/VITON_traindata/*  yuruonan@deep40:/scratch/e0973935/model_weights/custom_Qwen3-1.7B

# Cluster -> local Mac.
scp -r e0973935@hopper.nus.edu.sg:/scratch/e0973935/model_weights/custom_Qwen3-1.7B /Users/zigeng/Downloads/nips26/models

# Project-storage path for this grant (reference note, not a command).
/Project_Storage/CFP-03/CFP03-SF-102

# Scratch -> project storage backup of the SFT-70 weights.
scp -r /scratch/e0973935/model_weights/llada2.0_mini_sft_70 /Project_Storage/CFP-03/CFP03-SF-102

# Restore a checkpoint from project storage back to scratch.
scp -r /Project_Storage/CFP-03/CFP03-SF-102/llada2.0_mini_sft_70_5 /scratch/e0973935/model_weights/