File size: 7,882 Bytes
bf5d0ee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
# Boolean flag, disabled in this run.
# NOTE(review): "RC" presumably means reverse-complement — confirm with the
# code that reads this key.
RC_augmentation: false
# Table of dataset definitions keyed by dataset name. The top-level `dataset`
# key selects one entry with `${_dataset_cfg_lookup[${data}]}`.
# Two entry shapes appear below:
#   - `type: cmp_seq` entries carry split names, label/mask/sequence keys and
#     loader options in addition to `hf_path` and `path`;
#   - `type: refseq` entries carry only `hf_path`, `path` and `type`.
_dataset_cfg_lookup:
  # The five dlb_cmp_* entries are identical except for `hf_path` and `path`.
  dlb_cmp_gm12878:
    eval_split: validation
    hf_path: jzshared/dlb_cmp_gm12878
    label_key: label_ut
    mask_key: mask_ut
    num_workers: 0
    path: data/dlb_cmp_gm12878
    pin_memory: true
    reference_id: hg38
    sequence_format: string
    sequence_key: sequence
    shuffle: true
    test_split: test
    train_split: train
    type: cmp_seq
  dlb_cmp_h1hesc:
    eval_split: validation
    hf_path: jzshared/dlb_cmp_h1hesc
    label_key: label_ut
    mask_key: mask_ut
    num_workers: 0
    path: data/dlb_cmp_h1hesc
    pin_memory: true
    reference_id: hg38
    sequence_format: string
    sequence_key: sequence
    shuffle: true
    test_split: test
    train_split: train
    type: cmp_seq
  dlb_cmp_hct116:
    eval_split: validation
    hf_path: jzshared/dlb_cmp_hct116
    label_key: label_ut
    mask_key: mask_ut
    num_workers: 0
    path: data/dlb_cmp_hct116
    pin_memory: true
    reference_id: hg38
    sequence_format: string
    sequence_key: sequence
    shuffle: true
    test_split: test
    train_split: train
    type: cmp_seq
  dlb_cmp_hff:
    eval_split: validation
    hf_path: jzshared/dlb_cmp_hff
    label_key: label_ut
    mask_key: mask_ut
    num_workers: 0
    path: data/dlb_cmp_hff
    pin_memory: true
    reference_id: hg38
    sequence_format: string
    sequence_key: sequence
    shuffle: true
    test_split: test
    train_split: train
    type: cmp_seq
  dlb_cmp_imr90:
    eval_split: validation
    hf_path: jzshared/dlb_cmp_imr90
    label_key: label_ut
    mask_key: mask_ut
    num_workers: 0
    path: data/dlb_cmp_imr90
    pin_memory: true
    reference_id: hg38
    sequence_format: string
    sequence_key: sequence
    shuffle: true
    test_split: test
    train_split: train
    type: cmp_seq
  # refseq entries. Some keys below contain a dot (e.g. `..._12.8k`), which is
  # why the `dataset` lookup uses bracket access rather than a dotted path.
  euks_refseq_region_12.8k:
    hf_path: jzshared/euks_refseq_all_12p8k_merged_10m_20260302
    path: data/euks_refseq_all_12p8k_merged_10m_20260302
    type: refseq
  gencode128k_basic:
    hf_path: jzshared/gencode128k_basic
    path: data/gencode128k_basic
    type: refseq
  gencode128k_debug:
    hf_path: jzshared/gencode128k_debug
    path: data/gencode128k_debug
    type: refseq
  # Entry selected by this run (`data: gencode_human_12.8k`).
  gencode_human_12.8k:
    hf_path: jzshared/gencode_human_12.8k
    path: data/gencode_human_12.8k
    type: refseq
  gencode_human_128k:
    hf_path: jzshared/gencode_human_128k
    path: data/gencode_human_128k
    type: refseq
  hg38_128k:
    hf_path: jzshared/hg38_cds_anchored_128000
    path: data/hg38_cds_anchored_128000
    type: refseq
  # NOTE(review): unlike the other entries, the repo name in `hf_path` and the
  # directory name in `path` do not match here — confirm they refer to the
  # same data.
  hg38_12k:
    hf_path: jzshared/hg38_12800
    path: data/hg38_cds_anchored_len12800_mincds150_1000000samples
    type: refseq
  # Only entry with `hf_path: null`; just a local `path` is given.
  hg38_cds_4m:
    hf_path: null
    path: data/hg38_cds_dataset_4m_filtered
    type: refseq
  # Same field values as the dlb_cmp_* entries apart from `hf_path` and `path`.
  orca32m_cmp_seq:
    eval_split: validation
    hf_path: jzshared/orca32m_cmp
    label_key: label_ut
    mask_key: mask_ut
    num_workers: 0
    path: data/orca32m_cmp_seq
    pin_memory: true
    reference_id: hg38
    sequence_format: string
    sequence_key: sequence
    shuffle: true
    test_split: test
    train_split: train
    type: cmp_seq
# Three lists describing config keys: exact key names (`fields`), key
# suffixes (`postfix`) and key prefixes (`prefix`).
# NOTE(review): presumably these mark keys to ignore when comparing or
# identifying runs — the consumer is not visible here; confirm before relying
# on it.
_unimportant_cfg:
  # Exact key names. Every name that exists in this dump is a top-level key;
  # `env`, `gpus`, `debug`, `train_dataset`, `eval_datasets`, `user_cfg`,
  # `hf_access_token`, `hf_token` and `save_every` are absent from it.
  fields:
  - gpus
  - debug
  - wandb
  - env
  - uid
  - local_rank
  - is_distributed
  - master_port
  - device_type
  - cluster
  - world_size
  - train_dataset
  - eval_datasets
  - user_cfg
  - rank
  - device
  - hf_access_token
  - hf_private
  - hf_repo
  - hf_user
  - hf_token
  - save_every
  - eval_steps
  - save_steps
  - upload_to_hf
  - logging
  - log_every
  - use_wandb
  - project_root
  - version
  # Key suffixes (e.g. `config_path`, `data_alias` end with one of these).
  postfix:
  - _path
  - _file
  - _dir
  - _alias
  - _prefix
  # Key prefixes; a single underscore, as used by `_dataset_cfg_lookup` and
  # `_unimportant_cfg` themselves.
  prefix:
  - _
# General run settings.
add_special_tokens: true
# Run alias; used in the `dirs.output` path template and equal to `wandb.name`
# in this dump.
alias: Gencode-MxDNA
# Same value as `model.arch`.
arch: hnet
# Forwarded to `training.per_device_train_batch_size`.
batch_size: 8
bidirectional_strategy: mean
cluster: mila
# Launch command recorded for this run. Stored as one single-line string: in
# a `>-` folded scalar the line breaks below become single spaces and no
# trailing newline is kept.
cmd: >-
  python src/scripts/rebuttal/train_mlm.py exp=rebuttal/mlm
  data=gencode_human_12.8k model=hnet/mamba_64m max_len=12800 batch_size=8
  eval_batch_size=1 grad_acc_steps=4 train_steps=7650 eval_steps=125
  save_steps=750 log_every=2 num_valid_samples=3000 upload_to_hf=true
  wandb.project=DNAFM_v2 tokenizer=mxdna alias=Gencode-MxDNA use_wandb=true
  hf_repo=jzshared/Gencode-MxDNA
config_path: null
# Name of the dataset for this run; it is the key used by the `dataset`
# interpolation to index `_dataset_cfg_lookup`.
data: gencode_human_12.8k
# `${.data}` is a relative reference to the sibling key `data`.
data_alias: ${.data}_${max_len}
# Bracket access is used for the lookup; dataset keys such as
# `gencode_human_12.8k` contain a dot.
dataset: ${_dataset_cfg_lookup[${data}]}
dataset_sequence_key: sequence
device: cuda
device_type: GPU
# Directory templates. All are built from `project_root`; the literal
# templates below all end with a trailing slash.
dirs:
  data_cache: ${project_root}/data_cache/
  data_storage: ${project_root}/data/
  hydra: ${project_root}/temp/hydra/
  # Output location is namespaced by dataset alias and run alias.
  output: ${project_root}/output/${data_alias}/${alias}/
  # Working directory is namespaced by the run `uid`.
  temp: ${project_root}/temp/working_dir/${uid}/
  # Takes WANDB_CACHE_DIR from the environment when set, otherwise falls back
  # to a directory under `project_root`.
  wandb_cache: ${oc.env:WANDB_CACHE_DIR,${project_root}/temp/wandb_cache/}
# Top-level hyperparameters and runtime values. Many of these are forwarded
# by interpolation into `training` (epochs, eval_batch_size, eval_steps,
# grad_acc_steps, log_every, lr, max_grad_norm, max_train_steps) or
# `model_cfg` (max_routing_tokens, min_routing_tokens).
epochs: 200
eval_batch_size: 1
eval_steps: 125
grad_acc_steps: 4
hf_private: false
hf_repo: jzshared/Gencode-MxDNA
hf_user: jzshared
is_distributed: true
local_rank: 0
log_every: 2
logging:
  level: info
  log_wandb_metric_to_stdout: true
lr: 0.001
mask_replace_prob: 0.8
# Quoted, so this loads as a string rather than an integer.
master_port: '46807'
max_data_samples: null
max_grad_norm: 2.0
max_len: 12800
# Alias of `max_len`.
max_length: ${max_len}
# NOTE(review): 0 here is below `min_routing_tokens: 8`; `use_routing_ceiling`
# is false in this dump, so the value may be unused — confirm how 0 is treated.
max_routing_tokens: 0
# Alias of `train_steps`.
max_train_steps: ${train_steps}
min_routing_tokens: 8
mixed_precision: bf16
mlm_probability: 0.15
mode: Formal
# Model selection; `model.name` is what `model_alias` reads.
model:
  arch: hnet
  name: hnet_mamba_64m
# Resolves to `model.name`, with `UnknownModel` as the default when that key
# cannot be selected.
model_alias: ${oc.select:model.name,UnknownModel}
# Model architecture settings. Several fields (`d_model`, `d_intermediate`
# and the `attn_cfg` lists) hold two values each.
# NOTE(review): presumably one value per stage of the nested `arch_layout`
# (outer, inner) — confirm against the model code.
model_cfg:
  # Nested list: `m4`, then a one-element inner list `[m15]`, then `m4`.
  arch_layout:
  - m4
  - - m15
  - m4
  attn_cfg:
    num_heads:
    - 8
    - 12
    rotary_emb_dim:
    - 16
    - 24
    window_size:
    - 511
    - -1
  d_intermediate:
  - 0
  - 2048
  d_model:
  - 512
  - 768
  # Routing bounds and ratios are taken from the top-level keys of the same
  # names.
  max_routing_tokens: ${max_routing_tokens}
  min_routing_tokens: ${min_routing_tokens}
  n_gpt: 1.0
  r_hi: ${r_hi}
  r_low: ${r_low}
  ssm_cfg:
    chunk_size: 256
    d_conv: 4
    d_state: 64
    expand: 2
    head_dim: 64
  tie_embeddings: true
  # Same value as the top-level `tokenizer_vocab_size`.
  vocab_size: 9
# Repository-relative path to the tokenizer vocabulary file.
mxdna_tokenizer_vocab_path: src/scripts/rebuttal/assets/mxdna_1mertokenizer/vocab.txt
name: hnet_base
num_test_samples: 0
num_train_samples: 0
num_valid_samples: 3000
# Resolved through Hydra's `hydra:` resolver, so it is only resolvable when
# that resolver is available.
project_root: ${hydra:runtime.cwd}
# Referenced by `model_cfg.r_hi` and `model_cfg.r_low`.
r_hi: 0.3
r_low: 0.0
random_replace_prob: 0.1
random_truncate: false
rank: 0
reference_loss: null
save_steps: 750
seed: 0
# Resolves to the `type` of the selected dataset entry.
source: ${dataset.type}
tokenizer: mxdna
# NOTE(review): `dirs.data_cache` already ends with `/`, so this resolves to a
# path containing `//` — equivalent on POSIX file systems, but confirm nothing
# compares the string verbatim.
tokenizer_cache_dir: ${dirs.data_cache}/hf_tokenizers
tokenizer_max_length: null
tokenizer_name_or_path: null
# Absolute, machine-specific path; its tail matches
# `mxdna_tokenizer_vocab_path`.
tokenizer_name_or_path_resolved: /gpfs/scratch/guoh/DNAFM/src/scripts/rebuttal/assets/mxdna_1mertokenizer/vocab.txt
tokenizer_pad_to_multiple_of: null
tokenizer_trust_remote_code: false
tokenizer_use_fast: true
# Same value as `model_cfg.vocab_size`.
tokenizer_vocab_size: 9
# Referenced by `max_train_steps` and `training_alias`.
train_steps: 7650
# Trainer settings. Entries written as `${...}` are forwarded from the
# top-level keys they name; the rest are literal values.
# NOTE(review): the key names resemble Hugging Face `TrainingArguments`
# fields — confirm which trainer consumes this mapping.
training:
  adam_beta1: 0.9
  adam_beta2: 0.95
  # NOTE(review): top-level `mixed_precision: bf16` is set as well — confirm
  # which of the two the training code reads.
  bf16: true
  dataloader_drop_last: true
  dataloader_num_workers: 1
  disable_tqdm: false
  do_train: true
  eval_steps: ${eval_steps}
  eval_strategy: steps
  gradient_accumulation_steps: ${grad_acc_steps}
  gradient_checkpointing: false
  group_by_length: false
  hnet_initializer_range: 0.02
  hnet_lr_multiplier: null
  label_names:
  - labels
  learning_rate: ${lr}
  logging_steps: ${log_every}
  lr_scheduler_type: linear
  max_grad_norm: ${max_grad_norm}
  # Resolves via top-level `max_train_steps` to `train_steps`.
  max_train_steps: ${max_train_steps}
  num_train_epochs: ${epochs}
  output_dir: ${dirs.output}
  overrides: {}
  per_device_eval_batch_size: ${eval_batch_size}
  per_device_train_batch_size: ${batch_size}
  remove_unused_columns: false
  # NOTE(review): null here while top-level `use_wandb` is true — confirm
  # that W&B logging is wired up outside this field.
  report_to: null
  resume_from_checkpoint: null
  save_steps: ${save_steps}
  save_strategy: steps
  use_lr_multiplier: true
  warmup_steps: 500
  weight_decay: 0.1
# Descriptive run label assembled from tokenizer, learning rate, step counts
# and maximum length. `train_steps` and `max_train_steps` resolve to the same
# number, so it appears twice in the resolved string.
training_alias: mlm_${tokenizer}_lr${lr}_${train_steps}steps_ms${max_train_steps}_maxlen${max_len}
# Run identifier; same value as `wandb.id`, and used in `dirs.temp`.
uid: ywrwxmjk
upload_to_hf: true
use_routing_ceiling: false
use_routing_floor: true
use_wandb: true
valid_test_downsample: null
# Plain string `NA`, not a null value.
version: NA
# Weights & Biases settings for this run.
wandb:
  dir: ${dirs.wandb_cache}
  # Nested fallbacks: `env.vars.wandb_entity` from the config if selectable,
  # otherwise the WANDB_ENTITY environment variable, otherwise null.
  entity: ${oc.select:env.vars.wandb_entity,${oc.env:WANDB_ENTITY,null}}
  # Same value as the top-level `uid`.
  id: ywrwxmjk
  mode: online
  # Same value as the top-level `alias`.
  name: Gencode-MxDNA
  project: DNAFM_v2
  step_metric: null
  tags: []
  # The URL ends with the run `id` above.
  url: https://wandb.ai/jzshared/DNAFM_v2/runs/ywrwxmjk
# Listed in `_unimportant_cfg.fields` together with `rank` and `local_rank`.
world_size: 8