---
# Training configuration for the `script_1` run.
# Reconstructed into block style from a flattened dump; scalar values are
# unchanged. Nesting follows the key order of the flattened text.

name: script_1
dump_dir: /fsx/craffel/lingua_logs/script_1
seed: 777
grad_acc_steps: 8
gc_collect_freq: 1000
probe_freq: null
steps: 100000

data:
  root_dir: /scratch/craffel/lingua/data/flexitok/
  # Per-source weights (presumably sampling proportions - confirm with the
  # data loader). fw_edu takes 0.4 and the 20 language sources share the
  # remaining 0.6, so the weights sum to 1.0.
  sources:
    fw_edu: 0.4
    dan_Latn: 0.0216582869670702
    swe_Latn: 0.0216359765418466
    vie_Latn: 0.0197485510268674
    hun_Latn: 0.0247194573562308
    fas_Arab: 0.0205634624231076
    tur_Latn: 0.0235455794841729
    ces_Latn: 0.0248024455266208
    arb_Arab: 0.0234323706569333
    ell_Grek: 0.0233670886888026
    ind_Latn: 0.0269322054593488
    nld_Latn: 0.0277796326621489
    pol_Latn: 0.0294120104572311
    por_Latn: 0.0301413168306825
    ita_Latn: 0.0324056371021865
    jpn_Jpan: 0.03553104151369
    fra_Latn: 0.0381835560678536
    spa_Latn: 0.0387222793083669
    deu_Latn: 0.0419925340453022
    cmn_Hani: 0.0454067521384114
    rus_Cyrl: 0.0500198157431261
  batch_size: 4
  seq_len: 4096
  n_views: 2
  seed: 42
  add_bos: true
  add_eos: true
  load_async: true
  prefetch_size: 1024
  tokenizer:
    name: supertokenizer
    path: meta-llama/Llama-3.2-1B
    # Component tokenizers; each path is referenced by
    # routing.source_to_tokenizer below.
    tokenizers:
      - name: huggingface
        path: flexitok/bpe_script_Arab_16000
        load_supermapping: true
      - name: huggingface
        path: flexitok/bpe_script_CmJp_16000
        load_supermapping: true
      - name: huggingface
        path: flexitok/bpe_ltr_ell_Grek_8000_v2
        load_supermapping: true
      - name: huggingface
        path: flexitok/bpe_ltr_fw_edu_32000_v2
        load_supermapping: true
      - name: huggingface
        path: flexitok/bpe_ltr_hun_Latn_8000_v2
        load_supermapping: true
      - name: huggingface
        path: flexitok/bpe_ltr_rus_Cyrl_16000_v2
        load_supermapping: true
      - name: huggingface
        path: flexitok/bpe_ltr_tur_Latn_8000_v2
        load_supermapping: true
      - name: huggingface
        path: flexitok/bpe_script_Germ_32000
        load_supermapping: true
      - name: huggingface
        path: flexitok/bpe_script_Roma_32000
        load_supermapping: true
      - name: huggingface
        path: flexitok/bpe_script_SEAS_16000
        load_supermapping: true
      - name: huggingface
        path: flexitok/bpe_script_Slav_16000
        load_supermapping: true
    load_supermapping: false
    dropout: 0.0
    seed: 42
    superset_code_name: script_1
    # Same value as model.vocab_size; keep the two in sync.
    n_words: 165022
    # NOTE(review): the flattened source does not show whether `routing`
    # nests under `tokenizer`, under `data`, or at the top level. It is
    # placed under `tokenizer` here - confirm against the config schema.
    routing:
      # Every key of data.sources has an entry here.
      source_to_tokenizer:
        arb_Arab: flexitok/bpe_script_Arab_16000
        fas_Arab: flexitok/bpe_script_Arab_16000
        cmn_Hani: flexitok/bpe_script_CmJp_16000
        jpn_Jpan: flexitok/bpe_script_CmJp_16000
        ell_Grek: flexitok/bpe_ltr_ell_Grek_8000_v2
        fw_edu: flexitok/bpe_ltr_fw_edu_32000_v2
        hun_Latn: flexitok/bpe_ltr_hun_Latn_8000_v2
        rus_Cyrl: flexitok/bpe_ltr_rus_Cyrl_16000_v2
        tur_Latn: flexitok/bpe_ltr_tur_Latn_8000_v2
        dan_Latn: flexitok/bpe_script_Germ_32000
        deu_Latn: flexitok/bpe_script_Germ_32000
        nld_Latn: flexitok/bpe_script_Germ_32000
        swe_Latn: flexitok/bpe_script_Germ_32000
        fra_Latn: flexitok/bpe_script_Roma_32000
        ita_Latn: flexitok/bpe_script_Roma_32000
        por_Latn: flexitok/bpe_script_Roma_32000
        spa_Latn: flexitok/bpe_script_Roma_32000
        ind_Latn: flexitok/bpe_script_SEAS_16000
        vie_Latn: flexitok/bpe_script_SEAS_16000
        ces_Latn: flexitok/bpe_script_Slav_16000
        pol_Latn: flexitok/bpe_script_Slav_16000
      task_to_tokenizer: {}
      suitable_tokenizer_probability: 0.9

optim:
  lr: 0.001
  weight_decay: 0.1
  epsilon: 1.0e-08
  beta1: 0.9
  beta2: 0.95
  clip: 1.0
  scheduler: cosine
  warmup: 2000
  lr_min_ratio: 1.0e-06
  cycle_length: 1.0
  cosine_theta: 1.0
  annealing_step: 1000
  decay_fraction: 0.1
  exp_factor: 0.5

model:
  dim: 2048
  n_layers: 25
  head_dim: null
  n_heads: 16
  n_kv_heads: null
  ffn_dim_multiplier: null
  multiple_of: 256
  norm_eps: 1.0e-05
  rope_theta: 10000.0
  init_base_std: null
  init_std_factor: disabled
  # Same value as data.seq_len.
  max_seqlen: 4096
  seed: 42
  # Same value as data.tokenizer.n_words; keep the two in sync.
  vocab_size: 165022
  weight_tying: false
  sliding_window: null
  use_factorized_embeddings: false
  factorized_embedding_dim: 0

distributed:
  dp_shard: 1
  dp_replicate: 8
  tp_size: 1
  selective_activation_checkpointing: false
  compile: true
  fsdp_type: full_shard
  model_dtype: bf16
  float8_recipe: null
  # Regex-style pattern; single-quoted so the backslashes and brackets are
  # taken literally (same string value as the unquoted form).
  float8_filter: 'layers\.[0-9]+\.'
  matmul_allow_tf32: false
  detect_anomaly: false
  compile_cache_size_limit: 8
  spawn_method: forkserver

# Values are kept as strings; the numeric ones are quoted so they are not
# parsed as integers.
env:
  MKL_SERVICE_FORCE_INTEL: GNU
  OMP_NUM_THREADS: '1'
  MKL_NUM_THREADS: '1'
  ENABLE_INTRA_NODE_COMM: '1'
  TORCH_NCCL_AVOID_RECORD_STREAMS: '1'
  NCCL_IB_TIMEOUT: '22'
  NCCL_DEBUG: INFO
  TORCH_NCCL_ASYNC_ERROR_HANDLING: '1'

checkpoint:
  dump:
    every: 10000
    keep: -1
  eval:
    every: 10000
    keep: -1
  path: /fsx/craffel/lingua_logs/checkpoints/script_1
  init_ckpt_path: null
  load_init_optimizer_state: false
  save_init_ckpt: false

profiling:
  run: true
  trace_folder: profiling
  mem_warmup: 0
  mem_steps: 4
  profile_warmup: 100
  profile_steps: 4

logging:
  freq: 1
  acc_freq: null
  wandb: null

async_eval_gpus: 8

eval:
  harness:
    tasks:
      - hellaswag
      - piqa
      - arc_easy
      - arc_challenge
      - include_base_44_arabic
      - include_base_44_chinese
      - include_base_44_german
      - include_base_44_greek
      - include_base_44_persian
      - include_base_44_french
      - include_base_44_hungarian
      - include_base_44_indonesian
      - include_base_44_italian
      - include_base_44_japanese
      - include_base_44_dutch
      - include_base_44_polish
      - include_base_44_portuguese
      - include_base_44_russian
      - include_base_44_spanish
      - include_base_44_turkish
      - include_base_44_vietnamese
      - belebele_arb_Arab
      - belebele_ces_Latn
      - belebele_zho_Hans
      - belebele_dan_Latn
      - belebele_deu_Latn
      - belebele_ell_Grek
      - belebele_pes_Arab
      - belebele_fra_Latn
      - belebele_hun_Latn
      - belebele_ind_Latn
      - belebele_ita_Latn
      - belebele_jpn_Jpan
      - belebele_nld_Latn
      - belebele_pol_Latn
      - belebele_por_Latn
      - belebele_rus_Cyrl
      - belebele_spa_Latn
      - belebele_swe_Latn
      - belebele_tur_Latn
      - belebele_vie_Latn
      - belebele_eng_Latn
      - xnli_ar
      - xnli_zh
      - xnli_de
      - xnli_el
      - xnli_en
      - xnli_es
      - xnli_fr
      - xnli_ru
      - xnli_tr
      - xnli_vi
  generator:
    max_tokens: 16384
    dtype: bf16
    add_bos: false