# Run-level settings for the "flexitok_llama" run.
# NOTE(review): the key layout looks like a Lingua-style trainer config; the
# exact meaning of each key is defined by the consuming trainer — confirm there.
name: flexitok_llama
# Absolute path on a scratch filesystem; presumably the directory that receives
# this run's outputs — verify against the trainer.
dump_dir: /scratch/gsa/flexitok/init_models
# Top-level seed. Note that data.seed, data.tokenizer.seed and model.seed are
# set separately in this file (all 42).
seed: 777
# presumably the number of micro-batches accumulated per optimizer step — TODO confirm
grad_acc_steps: 8
# presumably the interval, in steps, between explicit garbage collections — TODO confirm
gc_collect_freq: 1000
# Explicit null: no probing interval is set.
probe_freq: null
# presumably the total number of training steps — TODO confirm
steps: 100000
# Data-loading settings: dataset location, per-source mixture weights, batching
# and tokenizer selection.
data:
  # Absolute path under which the sources listed below are presumably looked up
  # by name — verify against the data loader.
  root_dir: /scratch/craffel/lingua/data/flexitok/
  # One numeric weight per source. fw_edu carries 0.4 and the 20 remaining
  # entries together carry 0.6, so the weights sum to 1.0 (up to float rounding).
  # If a source is added or removed, re-balance the others to keep that sum.
  # NOTE(review): the non-fw_edu keys look like <language>_<script> codes
  # (e.g. dan_Latn); presumably the weights are sampling probabilities — confirm.
  sources:
    fw_edu: 0.4
    dan_Latn: 0.0216582869670702
    swe_Latn: 0.0216359765418466
    vie_Latn: 0.0197485510268674
    hun_Latn: 0.0247194573562308
    fas_Arab: 0.0205634624231076
    tur_Latn: 0.0235455794841729
    ces_Latn: 0.0248024455266208
    arb_Arab: 0.0234323706569333
    ell_Grek: 0.0233670886888026
    ind_Latn: 0.0269322054593488
    nld_Latn: 0.0277796326621489
    pol_Latn: 0.0294120104572311
    por_Latn: 0.0301413168306825
    ita_Latn: 0.0324056371021865
    jpn_Jpan: 0.03553104151369
    fra_Latn: 0.0381835560678536
    spa_Latn: 0.0387222793083669
    deu_Latn: 0.0419925340453022
    cmn_Hani: 0.0454067521384114
    rus_Cyrl: 0.0500198157431261
  # presumably sequences per batch per data-parallel rank — TODO confirm
  batch_size: 4
  # Same value as model.max_seqlen (4096) in this file.
  seq_len: 4096
  n_views: 2
  # Data seed; set independently of the top-level seed (777).
  seed: 42
  # Training data has both flags enabled. Note that eval.generator.add_bos is
  # false in this file.
  add_bos: true
  add_eos: true
  load_async: true
  prefetch_size: 1024
  tokenizer:
    name: huggingface
    # Looks like a Hugging Face Hub model id; presumably resolved by the
    # "huggingface" tokenizer backend named above — verify.
    path: meta-llama/Llama-3.2-1B
    # The remaining tokenizer options are left at null / false / 0.0 here.
    # NOTE(review): presumably they only matter when several tokenizers are
    # configured — confirm against the tokenizer builder.
    tokenizers: null
    load_supermapping: false
    dropout: 0.0
    seed: 42
    superset_code_name: super_vocab
    n_words: null
  # Both routing maps are explicitly empty ({}), so no per-source or per-task
  # tokenizer override is configured in this file.
  routing:
    source_to_tokenizer: {}
    task_to_tokenizer: {}
    suitable_tokenizer_probability: 1.0
# Optimizer and learning-rate schedule settings.
optim:
  lr: 0.001
  weight_decay: 0.1
  # Keep the decimal point and the signed exponent in scientific-notation
  # values: some YAML 1.1 loaders read shorter spellings such as 1e-08 as a
  # string rather than a float.
  epsilon: 1.0e-08
  beta1: 0.9
  beta2: 0.95
  # presumably the gradient-norm clipping threshold — TODO confirm
  clip: 1.0
  scheduler: cosine
  # presumably the number of warmup steps (out of steps: 100000) — TODO confirm
  warmup: 2000
  # presumably the final learning rate expressed as a fraction of lr — TODO confirm
  lr_min_ratio: 1.0e-06
  cycle_length: 1.0
  cosine_theta: 1.0
  # NOTE(review): the three keys below presumably belong to schedulers other
  # than "cosine" and may be ignored for this run — verify against the
  # scheduler implementation before relying on them.
  annealing_step: 1000
  decay_fraction: 0.1
  exp_factor: 0.5
# Model architecture settings.
model:
  dim: 2048
  n_layers: 25
  # null here; presumably derived by the model code as dim / n_heads
  # (2048 / 16 = 128) — TODO confirm.
  head_dim: null
  n_heads: 16
  # null here; presumably falls back to n_heads (no grouped-query attention)
  # — TODO confirm.
  n_kv_heads: null
  ffn_dim_multiplier: null
  multiple_of: 256
  norm_eps: 1.0e-05
  rope_theta: 10000.0
  init_base_std: null
  # Plain string value; presumably the name of an enum member in the model
  # code — verify the accepted values there.
  init_std_factor: disabled
  # Same value as data.seq_len (4096) in this file.
  max_seqlen: 4096
  # Model seed; set independently of the top-level seed (777).
  seed: 42
  # NOTE(review): presumably must match the vocabulary size of the tokenizer
  # at data.tokenizer.path — confirm before changing either one.
  vocab_size: 128256
  weight_tying: false
  sliding_window: null
  # Factorized embeddings are switched off; the dimension below is 0.
  use_factorized_embeddings: false
  factorized_embedding_dim: 0
# Parallelism, precision and compilation settings.
distributed:
  # All three parallelism degrees are 1 in this file.
  # NOTE(review): confirm how the trainer reconciles these values with the
  # number of processes actually launched.
  dp_shard: 1
  dp_replicate: 1
  tp_size: 1
  selective_activation_checkpointing: false
  compile: true
  fsdp_type: full_shard
  model_dtype: bf16
  # No float8 recipe is set; presumably the filter below is unused while this
  # is null — TODO confirm.
  float8_recipe: null
  # Regular expression written as a plain (unquoted) scalar, so the backslashes
  # are literal characters. It matches "layers.", one or more digits, then ".".
  # If it is ever quoted, use single quotes so the backslashes stay literal.
  float8_filter: layers\.[0-9]+\.
  matmul_allow_tf32: false
  detect_anomaly: false
  compile_cache_size_limit: 8
  spawn_method: forkserver
# Name/value pairs, presumably exported as process environment variables before
# training starts — verify against the launcher.
# Numeric-looking values are single-quoted on purpose so the YAML loader keeps
# them as strings ('1', '22') instead of converting them to integers.
env:
  MKL_SERVICE_FORCE_INTEL: GNU
  OMP_NUM_THREADS: '1'
  MKL_NUM_THREADS: '1'
  ENABLE_INTRA_NODE_COMM: '1'
  TORCH_NCCL_AVOID_RECORD_STREAMS: '1'
  NCCL_IB_TIMEOUT: '22'
  # INFO is a plain string; it presumably makes NCCL logging verbose — see the
  # NCCL environment-variable documentation.
  NCCL_DEBUG: INFO
  TORCH_NCCL_ASYNC_ERROR_HANDLING: '1'
# Checkpointing settings. "dump" and "eval" use the same cadence here
# (every 10000, keep -1); steps is 100000 in this file.
checkpoint:
  dump:
    every: 10000
    # presumably -1 means "keep every checkpoint" — TODO confirm
    keep: -1
  eval:
    every: 10000
    keep: -1
  # null here; presumably the trainer derives the checkpoint directory from
  # dump_dir — verify.
  path: null
  # No initial checkpoint is configured, and the two init-related flags below
  # are both off.
  init_ckpt_path: null
  load_init_optimizer_state: false
  save_init_ckpt: false
# Profiler settings; profiling is enabled for this run (run: true).
profiling:
  run: true
  # Relative path; presumably resolved against dump_dir — verify.
  trace_folder: profiling
  # NOTE(review): the *_warmup / *_steps pairs presumably mean "skip this many
  # steps, then record this many" for the memory and the trace profiler
  # respectively — confirm against the profiler wrapper.
  mem_warmup: 0
  mem_steps: 4
  profile_warmup: 100
  profile_steps: 4
# Logging settings.
logging:
  # presumably the logging interval in steps (1 = every step) — TODO confirm
  freq: 1
  acc_freq: null
  # null: no Weights & Biases configuration is provided in this file.
  wandb: null
# presumably the number of GPUs requested for asynchronous evaluation jobs
# — TODO confirm against the trainer.
async_eval_gpus: 8
# Evaluation settings: benchmark task list and text-generation options.
eval:
  harness:
    # Task identifiers, presumably resolved by an external evaluation harness
    # (e.g. lm-evaluation-harness) — verify the names against its task registry.
    tasks:
    - hellaswag
    - xnli_vi
  generator:
    # NOTE(review): 16384 is 4x model.max_seqlen (4096). Presumably this is a
    # total token budget rather than a per-sequence length — confirm.
    max_tokens: 16384
    dtype: bf16
    # NOTE(review): data.add_bos is true for training but this is false;
    # confirm the harness or tokenizer adds BOS elsewhere, or that the
    # mismatch is intended.
    add_bos: false