craffel HF Staff commited on
Commit
7d5560f
·
verified ·
1 Parent(s): a3bc6dd

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -113,3 +113,12 @@ flexitok_superset_albert_w_xglm/0000100000/__5_0.distcp filter=lfs diff=lfs merg
113
  flexitok_superset_albert_w_xglm/0000100000/__6_0.distcp filter=lfs diff=lfs merge=lfs -text
114
  flexitok_superset_albert_w_xglm/0000100000/__7_0.distcp filter=lfs diff=lfs merge=lfs -text
115
  fineweb2_hq_superset_lang_tokenizers/metrics.jsonl filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
113
  flexitok_superset_albert_w_xglm/0000100000/__6_0.distcp filter=lfs diff=lfs merge=lfs -text
114
  flexitok_superset_albert_w_xglm/0000100000/__7_0.distcp filter=lfs diff=lfs merge=lfs -text
115
  fineweb2_hq_superset_lang_tokenizers/metrics.jsonl filter=lfs diff=lfs merge=lfs -text
116
+ fineweb2_hq_superset_lang_tokenizers/0000100000/.metadata filter=lfs diff=lfs merge=lfs -text
117
+ fineweb2_hq_superset_lang_tokenizers/0000100000/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
118
+ fineweb2_hq_superset_lang_tokenizers/0000100000/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
119
+ fineweb2_hq_superset_lang_tokenizers/0000100000/__2_0.distcp filter=lfs diff=lfs merge=lfs -text
120
+ fineweb2_hq_superset_lang_tokenizers/0000100000/__3_0.distcp filter=lfs diff=lfs merge=lfs -text
121
+ fineweb2_hq_superset_lang_tokenizers/0000100000/__4_0.distcp filter=lfs diff=lfs merge=lfs -text
122
+ fineweb2_hq_superset_lang_tokenizers/0000100000/__5_0.distcp filter=lfs diff=lfs merge=lfs -text
123
+ fineweb2_hq_superset_lang_tokenizers/0000100000/__6_0.distcp filter=lfs diff=lfs merge=lfs -text
124
+ fineweb2_hq_superset_lang_tokenizers/0000100000/__7_0.distcp filter=lfs diff=lfs merge=lfs -text
fineweb2_hq_superset_lang_tokenizers/0000100000/.metadata ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9581f91716ee5db5cfe9344768e33d460d2ffc6e04d2d49e9312a4ef328f28c8
3
+ size 1148574
fineweb2_hq_superset_lang_tokenizers/0000100000/__0_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd5eb1a6f1f83f7918fc9b694c7d73edf1970073488d8b66f56c2b789873d678
3
+ size 2626712784
fineweb2_hq_superset_lang_tokenizers/0000100000/__1_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc50ffa169a949febf3994829371ec243720d63e8409eca4ac16b690305ee463
3
+ size 2626751724
fineweb2_hq_superset_lang_tokenizers/0000100000/__2_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51f34bd0e2134d4d5902ac64faa2e550425c7dbaa95c103956e57ccc0379e1d2
3
+ size 2626751724
fineweb2_hq_superset_lang_tokenizers/0000100000/__3_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb5052c8b29777ea0079261173847b9b94d69a9031a3910686e8c961ec30cf75
3
+ size 2626751724
fineweb2_hq_superset_lang_tokenizers/0000100000/__4_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3de294b83b4b164a0b28c424b093cabfa09c36c791b95a6edc51d834e80ab2d5
3
+ size 2626751724
fineweb2_hq_superset_lang_tokenizers/0000100000/__5_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45b40ec8e79aa400519071f3d389196bc63a412968a396fa7794efaa77afe5fc
3
+ size 2626754000
fineweb2_hq_superset_lang_tokenizers/0000100000/__6_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f0896ef85b8d7a51ef7de391ce634952b2310b81495dc060b13fc9361d6fbee
3
+ size 2626754000
fineweb2_hq_superset_lang_tokenizers/0000100000/__7_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ded272e20d51b120c6f18af811a9400dbb8a30fbd71e8d8ce0e5a322e51745b
3
+ size 2626565648
fineweb2_hq_superset_lang_tokenizers/0000100000/consolidated/consolidated.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88e8b5c6f2d18c84e8e49b08b2555eb2d8a8626e2449454772cf52f555f3a654
3
+ size 21007336342
fineweb2_hq_superset_lang_tokenizers/0000100000/consolidated/params.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"name": "flexitok_superset_lang_tokenizers", "dump_dir": "/fsx/craffel/lingua_logs/fineweb2_hq_superset_lang_tokenizers", "seed": 777, "grad_acc_steps": 8, "gc_collect_freq": 1000, "probe_freq": null, "steps": 100000, "data": {"root_dir": "/scratch/craffel/lingua/data/flexitok/", "sources": {"fw_edu": 0.4, "dan_Latn": 0.0216582869670702, "swe_Latn": 0.0216359765418466, "vie_Latn": 0.0197485510268674, "hun_Latn": 0.0247194573562308, "fas_Arab": 0.0205634624231076, "tur_Latn": 0.0235455794841729, "ces_Latn": 0.0248024455266208, "arb_Arab": 0.0234323706569333, "ell_Grek": 0.0233670886888026, "ind_Latn": 0.0269322054593488, "nld_Latn": 0.0277796326621489, "pol_Latn": 0.0294120104572311, "por_Latn": 0.0301413168306825, "ita_Latn": 0.0324056371021865, "jpn_Jpan": 0.03553104151369, "fra_Latn": 0.0381835560678536, "spa_Latn": 0.0387222793083669, "deu_Latn": 0.0419925340453022, "cmn_Hani": 0.0454067521384114, "rus_Cyrl": 0.0500198157431261}, "batch_size": 4, "seq_len": 4096, "n_views": 2, "seed": 42, "add_bos": true, "add_eos": true, "load_async": true, "prefetch_size": 1024, "tokenizer": {"name": "supertokenizer", "path": "meta-llama/Llama-3.2-1B", "tokenizers": [{"name": "huggingface", "path": "flexitok/bpe_arb_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ces_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_cmn_Hani_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_dan_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_deu_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ell_Grek_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fas_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fra_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fw_edu_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_hun_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ind_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ita_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_jpn_Jpan_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_nld_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_pol_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_por_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_rus_Cyrl_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_spa_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_swe_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_tur_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_vie_Latn_8000", "load_supermapping": true}], "load_supermapping": false, "dropout": 0.0, "seed": 42, "superset_code_name": "fineweb2_hq", "n_words": 113764}}, "optim": {"lr": 0.001, "weight_decay": 0.1, "epsilon": 1e-08, "beta1": 0.9, "beta2": 0.95, "clip": 1.0, "scheduler": "cosine", "warmup": 2000, "lr_min_ratio": 1e-06, "cycle_length": 1.0, "cosine_theta": 1.0, "annealing_step": 1000, "decay_fraction": 0.1, "exp_factor": 0.5}, "model": {"dim": 2048, "n_layers": 25, "head_dim": null, "n_heads": 16, "n_kv_heads": null, "ffn_dim_multiplier": null, "multiple_of": 256, "norm_eps": 1e-05, "rope_theta": 10000.0, "init_base_std": null, "init_std_factor": "disabled", "max_seqlen": 4096, "seed": 42, "vocab_size": 113764, "weight_tying": false, "sliding_window": null, "use_factorized_embeddings": false, "factorized_embedding_dim": 0}, "distributed": {"dp_shard": 1, "dp_replicate": 8, "tp_size": 1, "selective_activation_checkpointing": false, "compile": true, "fsdp_type": "full_shard", "model_dtype": "bf16", "float8_recipe": null, "float8_filter": "layers\\.[0-9]+\\.", "matmul_allow_tf32": false, "detect_anomaly": false, "compile_cache_size_limit": 8, "spawn_method": "forkserver"}, "env": {"MKL_SERVICE_FORCE_INTEL": "GNU", "OMP_NUM_THREADS": "1", "MKL_NUM_THREADS": "1", "ENABLE_INTRA_NODE_COMM": "1", "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", "NCCL_IB_TIMEOUT": "22", "NCCL_DEBUG": "INFO", "TORCH_NCCL_ASYNC_ERROR_HANDLING": "1"}, "checkpoint": {"dump": {"every": 10000, "keep": -1}, "eval": {"every": 10000, "keep": -1}, "path": "/fsx/craffel/lingua_logs/fineweb2_hq_superset_lang_tokenizers/checkpoints", "init_ckpt_path": null, "load_init_optimizer_state": false, "save_init_ckpt": false}, "profiling": {"run": true, "trace_folder": "profiling", "mem_warmup": 0, "mem_steps": 4, "profile_warmup": 100, "profile_steps": 4}, "logging": {"freq": 1, "acc_freq": null, "wandb": null}, "async_eval_gpus": 8, "eval": {"harness": {"tasks": ["hellaswag", "piqa", "arc_easy", "arc_challenge", "include_base_44_arabic", "include_base_44_chinese", "include_base_44_german", "include_base_44_greek", "include_base_44_persian", "include_base_44_french", "include_base_44_hungarian", "include_base_44_indonesian", "include_base_44_italian", "include_base_44_japanese", "include_base_44_dutch", "include_base_44_polish", "include_base_44_portuguese", "include_base_44_russian", "include_base_44_spanish", "include_base_44_turkish", "include_base_44_vietnamese", "belebele_arb_Arab", "belebele_ces_Latn", "belebele_zho_Hans", "belebele_dan_Latn", "belebele_deu_Latn", "belebele_ell_Grek", "belebele_pes_Arab", "belebele_fra_Latn", "belebele_hun_Latn", "belebele_ind_Latn", "belebele_ita_Latn", "belebele_jpn_Jpan", "belebele_nld_Latn", "belebele_pol_Latn", "belebele_por_Latn", "belebele_rus_Cyrl", "belebele_spa_Latn", "belebele_swe_Latn", "belebele_tur_Latn", "belebele_vie_Latn", "belebele_eng_Latn", "xnli_ar", "xnli_zh", "xnli_de", "xnli_el", "xnli_en", "xnli_es", "xnli_fr", "xnli_hi", "xnli_ru", "xnli_tr", "xnli_vi"]}, "generator": {"max_tokens": 16384, "dtype": "bf16", "add_bos": false}}}
fineweb2_hq_superset_lang_tokenizers/0000100000/params.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"name": "flexitok_superset_lang_tokenizers", "dump_dir": "/fsx/craffel/lingua_logs/fineweb2_hq_superset_lang_tokenizers", "seed": 777, "grad_acc_steps": 8, "gc_collect_freq": 1000, "probe_freq": null, "steps": 100000, "data": {"root_dir": "/scratch/craffel/lingua/data/flexitok/", "sources": {"fw_edu": 0.4, "dan_Latn": 0.0216582869670702, "swe_Latn": 0.0216359765418466, "vie_Latn": 0.0197485510268674, "hun_Latn": 0.0247194573562308, "fas_Arab": 0.0205634624231076, "tur_Latn": 0.0235455794841729, "ces_Latn": 0.0248024455266208, "arb_Arab": 0.0234323706569333, "ell_Grek": 0.0233670886888026, "ind_Latn": 0.0269322054593488, "nld_Latn": 0.0277796326621489, "pol_Latn": 0.0294120104572311, "por_Latn": 0.0301413168306825, "ita_Latn": 0.0324056371021865, "jpn_Jpan": 0.03553104151369, "fra_Latn": 0.0381835560678536, "spa_Latn": 0.0387222793083669, "deu_Latn": 0.0419925340453022, "cmn_Hani": 0.0454067521384114, "rus_Cyrl": 0.0500198157431261}, "batch_size": 4, "seq_len": 4096, "n_views": 2, "seed": 42, "add_bos": true, "add_eos": true, "load_async": true, "prefetch_size": 1024, "tokenizer": {"name": "supertokenizer", "path": "meta-llama/Llama-3.2-1B", "tokenizers": [{"name": "huggingface", "path": "flexitok/bpe_arb_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ces_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_cmn_Hani_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_dan_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_deu_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ell_Grek_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fas_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fra_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fw_edu_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_hun_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ind_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ita_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_jpn_Jpan_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_nld_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_pol_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_por_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_rus_Cyrl_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_spa_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_swe_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_tur_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_vie_Latn_8000", "load_supermapping": true}], "load_supermapping": false, "dropout": 0.0, "seed": 42, "superset_code_name": "fineweb2_hq", "n_words": 113764}}, "optim": {"lr": 0.001, "weight_decay": 0.1, "epsilon": 1e-08, "beta1": 0.9, "beta2": 0.95, "clip": 1.0, "scheduler": "cosine", "warmup": 2000, "lr_min_ratio": 1e-06, "cycle_length": 1.0, "cosine_theta": 1.0, "annealing_step": 1000, "decay_fraction": 0.1, "exp_factor": 0.5}, "model": {"dim": 2048, "n_layers": 25, "head_dim": null, "n_heads": 16, "n_kv_heads": null, "ffn_dim_multiplier": null, "multiple_of": 256, "norm_eps": 1e-05, "rope_theta": 10000.0, "init_base_std": null, "init_std_factor": "disabled", "max_seqlen": 4096, "seed": 42, "vocab_size": 113764, "weight_tying": false, "sliding_window": null, "use_factorized_embeddings": false, "factorized_embedding_dim": 0}, "distributed": {"dp_shard": 1, "dp_replicate": 8, "tp_size": 1, "selective_activation_checkpointing": false, "compile": true, "fsdp_type": "full_shard", "model_dtype": "bf16", "float8_recipe": null, "float8_filter": "layers\\.[0-9]+\\.", "matmul_allow_tf32": false, "detect_anomaly": false, "compile_cache_size_limit": 8, "spawn_method": "forkserver"}, "env": {"MKL_SERVICE_FORCE_INTEL": "GNU", "OMP_NUM_THREADS": "1", "MKL_NUM_THREADS": "1", "ENABLE_INTRA_NODE_COMM": "1", "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", "NCCL_IB_TIMEOUT": "22", "NCCL_DEBUG": "INFO", "TORCH_NCCL_ASYNC_ERROR_HANDLING": "1"}, "checkpoint": {"dump": {"every": 10000, "keep": -1}, "eval": {"every": 10000, "keep": -1}, "path": "/fsx/craffel/lingua_logs/fineweb2_hq_superset_lang_tokenizers/checkpoints", "init_ckpt_path": null, "load_init_optimizer_state": false, "save_init_ckpt": false}, "profiling": {"run": true, "trace_folder": "profiling", "mem_warmup": 0, "mem_steps": 4, "profile_warmup": 100, "profile_steps": 4}, "logging": {"freq": 1, "acc_freq": null, "wandb": null}, "async_eval_gpus": 8, "eval": {"harness": {"tasks": ["hellaswag", "piqa", "arc_easy", "arc_challenge", "include_base_44_arabic", "include_base_44_chinese", "include_base_44_german", "include_base_44_greek", "include_base_44_persian", "include_base_44_french", "include_base_44_hungarian", "include_base_44_indonesian", "include_base_44_italian", "include_base_44_japanese", "include_base_44_dutch", "include_base_44_polish", "include_base_44_portuguese", "include_base_44_russian", "include_base_44_spanish", "include_base_44_turkish", "include_base_44_vietnamese", "belebele_arb_Arab", "belebele_ces_Latn", "belebele_zho_Hans", "belebele_dan_Latn", "belebele_deu_Latn", "belebele_ell_Grek", "belebele_pes_Arab", "belebele_fra_Latn", "belebele_hun_Latn", "belebele_ind_Latn", "belebele_ita_Latn", "belebele_jpn_Jpan", "belebele_nld_Latn", "belebele_pol_Latn", "belebele_por_Latn", "belebele_rus_Cyrl", "belebele_spa_Latn", "belebele_swe_Latn", "belebele_tur_Latn", "belebele_vie_Latn", "belebele_eng_Latn", "xnli_ar", "xnli_zh", "xnli_de", "xnli_el", "xnli_en", "xnli_es", "xnli_fr", "xnli_hi", "xnli_ru", "xnli_tr", "xnli_vi"]}, "generator": {"max_tokens": 16384, "dtype": "bf16", "add_bos": false}}}
fineweb2_hq_superset_lang_tokenizers/0000100000/train_state_00000.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 100000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 3977, "it_state": {"it_state": {"root_dir": "/scratch/craffel/lingua/data/flexitok/", "sources": {"fw_edu": 0.4, "dan_Latn": 0.0216582869670702, "swe_Latn": 0.0216359765418466, "vie_Latn": 0.0197485510268674, "hun_Latn": 0.0247194573562308, "fas_Arab": 0.0205634624231076, "tur_Latn": 0.0235455794841729, "ces_Latn": 0.0248024455266208, "arb_Arab": 0.0234323706569333, "ell_Grek": 0.0233670886888026, "ind_Latn": 0.0269322054593488, "nld_Latn": 0.0277796326621489, "pol_Latn": 0.0294120104572311, "por_Latn": 0.0301413168306825, "ita_Latn": 0.0324056371021865, "jpn_Jpan": 0.03553104151369, "fra_Latn": 0.0381835560678536, "spa_Latn": 0.0387222793083669, "deu_Latn": 0.0419925340453022, "cmn_Hani": 0.0454067521384114, "rus_Cyrl": 0.0500198157431261}, "source_to_state": {"fw_edu": {"file_path": "/scratch/craffel/lingua/data/flexitok/fw_edu/fineweb_edu_100bt.chunk.00.jsonl", "position": 11332386417, "block_size": 1, "offset": 0, "current_iter": 0}, "dan_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/dan_Latn/fineweb_2_hq.dan_Latn.chunk.00.jsonl", "position": 519092778, "block_size": 1, "offset": 0, "current_iter": 0}, "swe_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/swe_Latn/fineweb_2_hq.swe_Latn.chunk.00.jsonl", "position": 491320683, "block_size": 1, "offset": 0, "current_iter": 0}, "vie_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/vie_Latn/fineweb_2_hq.vie_Latn.chunk.00.jsonl", "position": 524319188, "block_size": 1, "offset": 0, "current_iter": 0}, "hun_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/hun_Latn/fineweb_2_hq.hun_Latn.chunk.00.jsonl", "position": 696090070, "block_size": 1, "offset": 0, "current_iter": 0}, "fas_Arab": {"file_path": "/scratch/craffel/lingua/data/flexitok/fas_Arab/fineweb_2_hq.fas_Arab.chunk.00.jsonl", "position": 587312333, "block_size": 1, "offset": 0, "current_iter": 0}, "tur_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/tur_Latn/fineweb_2_hq.tur_Latn.chunk.00.jsonl", "position": 441906659, "block_size": 1, "offset": 0, "current_iter": 0}, "ces_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ces_Latn/fineweb_2_hq.ces_Latn.chunk.00.jsonl", "position": 681103317, "block_size": 1, "offset": 0, "current_iter": 0}, "arb_Arab": {"file_path": "/scratch/craffel/lingua/data/flexitok/arb_Arab/fineweb_2_hq.arb_Arab.chunk.00.jsonl", "position": 760263563, "block_size": 1, "offset": 0, "current_iter": 0}, "ell_Grek": {"file_path": "/scratch/craffel/lingua/data/flexitok/ell_Grek/fineweb_2_hq.ell_Grek.chunk.00.jsonl", "position": 948470392, "block_size": 1, "offset": 0, "current_iter": 0}, "ind_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ind_Latn/fineweb_2_hq.ind_Latn.chunk.00.jsonl", "position": 674083305, "block_size": 1, "offset": 0, "current_iter": 0}, "nld_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/nld_Latn/fineweb_2_hq.nld_Latn.chunk.00.jsonl", "position": 538697616, "block_size": 1, "offset": 0, "current_iter": 0}, "pol_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/pol_Latn/fineweb_2_hq.pol_Latn.chunk.00.jsonl", "position": 566420044, "block_size": 1, "offset": 0, "current_iter": 0}, "por_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/por_Latn/fineweb_2_hq.por_Latn.chunk.00.jsonl", "position": 587922462, "block_size": 1, "offset": 0, "current_iter": 0}, "ita_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ita_Latn/fineweb_2_hq.ita_Latn.chunk.00.jsonl", "position": 657337469, "block_size": 1, "offset": 0, "current_iter": 0}, "jpn_Jpan": {"file_path": "/scratch/craffel/lingua/data/flexitok/jpn_Jpan/fineweb_2_hq.jpn_Jpan.chunk.00.jsonl", "position": 686800435, "block_size": 1, "offset": 0, "current_iter": 0}, "fra_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/fra_Latn/fineweb_2_hq.fra_Latn.chunk.00.jsonl", "position": 909477982, "block_size": 1, "offset": 0, "current_iter": 0}, "spa_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/spa_Latn/fineweb_2_hq.spa_Latn.chunk.00.jsonl", "position": 813871044, "block_size": 1, "offset": 0, "current_iter": 0}, "deu_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/deu_Latn/fineweb_2_hq.deu_Latn.chunk.00.jsonl", "position": 994004647, "block_size": 1, "offset": 0, "current_iter": 0}, "cmn_Hani": {"file_path": "/scratch/craffel/lingua/data/flexitok/cmn_Hani/fineweb_2_hq.cmn_Hani.chunk.00.jsonl", "position": 1056125980, "block_size": 1, "offset": 0, "current_iter": 0}, "rus_Cyrl": {"file_path": "/scratch/craffel/lingua/data/flexitok/rus_Cyrl/fineweb_2_hq.rus_Cyrl.chunk.00.jsonl", "position": 2475797266, "block_size": 1, "offset": 0, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 230612475032513390196519612138757022735, "inc": 252101603063402394885084957393789173453}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "supertokenizer", "path": "meta-llama/Llama-3.2-1B", "tokenizers": [{"name": "huggingface", "path": "flexitok/bpe_arb_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ces_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_cmn_Hani_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_dan_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_deu_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ell_Grek_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fas_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fra_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fw_edu_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_hun_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ind_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ita_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_jpn_Jpan_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_nld_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_pol_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_por_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_rus_Cyrl_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_spa_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_swe_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_tur_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_vie_Latn_8000", "load_supermapping": true}], "dropout": 0.0, "rng_state": {"bit_generator": "PCG64", "state": {"state": 230333890073491887148512731646953067794, "inc": 332724090758049132448979897138935081983}, "has_uint32": 1, "uinteger": 269702842}, "seed": 42, "superset_code_name": "fineweb2_hq", "n_words": 113764}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 256, "rng_state": {"bit_generator": "PCG64", "state": {"state": 300569569296678341640414112158566886944, "inc": 257317082376085721142933171929815648017}, "has_uint32": 1, "uinteger": 630457105}, "batch_size": 4, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.001], "last_epoch": 100000, "verbose": false, "_step_count": 100001, "_get_lr_called_within_step": false, "_last_lr": [0.001], "lr_lambdas": [{}]}}
fineweb2_hq_superset_lang_tokenizers/0000100000/train_state_00001.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 100000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 1313, "it_state": {"it_state": {"root_dir": "/scratch/craffel/lingua/data/flexitok/", "sources": {"fw_edu": 0.4, "dan_Latn": 0.0216582869670702, "swe_Latn": 0.0216359765418466, "vie_Latn": 0.0197485510268674, "hun_Latn": 0.0247194573562308, "fas_Arab": 0.0205634624231076, "tur_Latn": 0.0235455794841729, "ces_Latn": 0.0248024455266208, "arb_Arab": 0.0234323706569333, "ell_Grek": 0.0233670886888026, "ind_Latn": 0.0269322054593488, "nld_Latn": 0.0277796326621489, "pol_Latn": 0.0294120104572311, "por_Latn": 0.0301413168306825, "ita_Latn": 0.0324056371021865, "jpn_Jpan": 0.03553104151369, "fra_Latn": 0.0381835560678536, "spa_Latn": 0.0387222793083669, "deu_Latn": 0.0419925340453022, "cmn_Hani": 0.0454067521384114, "rus_Cyrl": 0.0500198157431261}, "source_to_state": {"fw_edu": {"file_path": "/scratch/craffel/lingua/data/flexitok/fw_edu/fineweb_edu_100bt.chunk.01.jsonl", "position": 11372963130, "block_size": 1, "offset": 0, "current_iter": 0}, "dan_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/dan_Latn/fineweb_2_hq.dan_Latn.chunk.01.jsonl", "position": 519588872, "block_size": 1, "offset": 0, "current_iter": 0}, "swe_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/swe_Latn/fineweb_2_hq.swe_Latn.chunk.01.jsonl", "position": 492854893, "block_size": 1, "offset": 0, "current_iter": 0}, "vie_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/vie_Latn/fineweb_2_hq.vie_Latn.chunk.01.jsonl", "position": 522443795, "block_size": 1, "offset": 0, "current_iter": 0}, "hun_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/hun_Latn/fineweb_2_hq.hun_Latn.chunk.01.jsonl", "position": 691437428, "block_size": 1, "offset": 0, "current_iter": 0}, "fas_Arab": {"file_path": "/scratch/craffel/lingua/data/flexitok/fas_Arab/fineweb_2_hq.fas_Arab.chunk.01.jsonl", "position": 585325295, "block_size": 1, "offset": 0, "current_iter": 0}, "tur_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/tur_Latn/fineweb_2_hq.tur_Latn.chunk.01.jsonl", "position": 438787745, "block_size": 1, "offset": 0, "current_iter": 0}, "ces_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ces_Latn/fineweb_2_hq.ces_Latn.chunk.01.jsonl", "position": 684909926, "block_size": 1, "offset": 0, "current_iter": 0}, "arb_Arab": {"file_path": "/scratch/craffel/lingua/data/flexitok/arb_Arab/fineweb_2_hq.arb_Arab.chunk.01.jsonl", "position": 770172870, "block_size": 1, "offset": 0, "current_iter": 0}, "ell_Grek": {"file_path": "/scratch/craffel/lingua/data/flexitok/ell_Grek/fineweb_2_hq.ell_Grek.chunk.01.jsonl", "position": 958645851, "block_size": 1, "offset": 0, "current_iter": 0}, "ind_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ind_Latn/fineweb_2_hq.ind_Latn.chunk.01.jsonl", "position": 674350926, "block_size": 1, "offset": 0, "current_iter": 0}, "nld_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/nld_Latn/fineweb_2_hq.nld_Latn.chunk.01.jsonl", "position": 536108778, "block_size": 1, "offset": 0, "current_iter": 0}, "pol_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/pol_Latn/fineweb_2_hq.pol_Latn.chunk.01.jsonl", "position": 571764820, "block_size": 1, "offset": 0, "current_iter": 0}, "por_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/por_Latn/fineweb_2_hq.por_Latn.chunk.01.jsonl", "position": 588024177, "block_size": 1, "offset": 0, "current_iter": 0}, "ita_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ita_Latn/fineweb_2_hq.ita_Latn.chunk.01.jsonl", "position": 650395369, "block_size": 1, "offset": 0, "current_iter": 0}, "jpn_Jpan": {"file_path": "/scratch/craffel/lingua/data/flexitok/jpn_Jpan/fineweb_2_hq.jpn_Jpan.chunk.01.jsonl", "position": 686266939, "block_size": 1, "offset": 0, "current_iter": 0}, "fra_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/fra_Latn/fineweb_2_hq.fra_Latn.chunk.01.jsonl", "position": 914901162, "block_size": 1, "offset": 0, "current_iter": 0}, "spa_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/spa_Latn/fineweb_2_hq.spa_Latn.chunk.01.jsonl", "position": 827630003, "block_size": 1, "offset": 0, "current_iter": 0}, "deu_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/deu_Latn/fineweb_2_hq.deu_Latn.chunk.01.jsonl", "position": 989675330, "block_size": 1, "offset": 0, "current_iter": 0}, "cmn_Hani": {"file_path": "/scratch/craffel/lingua/data/flexitok/cmn_Hani/fineweb_2_hq.cmn_Hani.chunk.01.jsonl", "position": 1052591070, "block_size": 1, "offset": 0, "current_iter": 0}, "rus_Cyrl": {"file_path": "/scratch/craffel/lingua/data/flexitok/rus_Cyrl/fineweb_2_hq.rus_Cyrl.chunk.01.jsonl", "position": 2449328114, "block_size": 1, "offset": 0, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 133968813370586321652408302927267634225, "inc": 246509925186285949978196491240064802315}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "supertokenizer", "path": "meta-llama/Llama-3.2-1B", "tokenizers": [{"name": "huggingface", "path": "flexitok/bpe_arb_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ces_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_cmn_Hani_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_dan_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_deu_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ell_Grek_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fas_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fra_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fw_edu_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_hun_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ind_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ita_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_jpn_Jpan_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_nld_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_pol_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_por_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_rus_Cyrl_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_spa_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_swe_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_tur_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_vie_Latn_8000", "load_supermapping": true}], "dropout": 0.0, "rng_state": {"bit_generator": "PCG64", "state": {"state": 334342595775114840988952963472251012693, "inc": 332724090758049132448979897138935081983}, "has_uint32": 1, "uinteger": 3678022743}, "seed": 42, "superset_code_name": "fineweb2_hq", "n_words": 113764}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 256, "rng_state": {"bit_generator": "PCG64", "state": {"state": 298609680158430271867266436931522339460, "inc": 173555323965545256606922338259303677603}, "has_uint32": 1, "uinteger": 1182378492}, "batch_size": 4, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.001], "last_epoch": 100000, "verbose": false, "_step_count": 100001, "_get_lr_called_within_step": false, "_last_lr": [0.001], "lr_lambdas": [{}]}}
fineweb2_hq_superset_lang_tokenizers/0000100000/train_state_00002.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 100000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 420, "it_state": {"it_state": {"root_dir": "/scratch/craffel/lingua/data/flexitok/", "sources": {"fw_edu": 0.4, "dan_Latn": 0.0216582869670702, "swe_Latn": 0.0216359765418466, "vie_Latn": 0.0197485510268674, "hun_Latn": 0.0247194573562308, "fas_Arab": 0.0205634624231076, "tur_Latn": 0.0235455794841729, "ces_Latn": 0.0248024455266208, "arb_Arab": 0.0234323706569333, "ell_Grek": 0.0233670886888026, "ind_Latn": 0.0269322054593488, "nld_Latn": 0.0277796326621489, "pol_Latn": 0.0294120104572311, "por_Latn": 0.0301413168306825, "ita_Latn": 0.0324056371021865, "jpn_Jpan": 0.03553104151369, "fra_Latn": 0.0381835560678536, "spa_Latn": 0.0387222793083669, "deu_Latn": 0.0419925340453022, "cmn_Hani": 0.0454067521384114, "rus_Cyrl": 0.0500198157431261}, "source_to_state": {"fw_edu": {"file_path": "/scratch/craffel/lingua/data/flexitok/fw_edu/fineweb_edu_100bt.chunk.02.jsonl", "position": 11383114399, "block_size": 1, "offset": 0, "current_iter": 0}, "dan_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/dan_Latn/fineweb_2_hq.dan_Latn.chunk.02.jsonl", "position": 522469629, "block_size": 1, "offset": 0, "current_iter": 0}, "swe_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/swe_Latn/fineweb_2_hq.swe_Latn.chunk.02.jsonl", "position": 485635791, "block_size": 1, "offset": 0, "current_iter": 0}, "vie_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/vie_Latn/fineweb_2_hq.vie_Latn.chunk.02.jsonl", "position": 521983407, "block_size": 1, "offset": 0, "current_iter": 0}, "hun_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/hun_Latn/fineweb_2_hq.hun_Latn.chunk.02.jsonl", "position": 695309955, "block_size": 1, "offset": 0, "current_iter": 0}, "fas_Arab": {"file_path": "/scratch/craffel/lingua/data/flexitok/fas_Arab/fineweb_2_hq.fas_Arab.chunk.02.jsonl", "position": 579934320, "block_size": 1, "offset": 0, "current_iter": 0}, "tur_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/tur_Latn/fineweb_2_hq.tur_Latn.chunk.02.jsonl", "position": 442197623, "block_size": 1, "offset": 0, "current_iter": 0}, "ces_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ces_Latn/fineweb_2_hq.ces_Latn.chunk.02.jsonl", "position": 678744816, "block_size": 1, "offset": 0, "current_iter": 0}, "arb_Arab": {"file_path": "/scratch/craffel/lingua/data/flexitok/arb_Arab/fineweb_2_hq.arb_Arab.chunk.02.jsonl", "position": 770844291, "block_size": 1, "offset": 0, "current_iter": 0}, "ell_Grek": {"file_path": "/scratch/craffel/lingua/data/flexitok/ell_Grek/fineweb_2_hq.ell_Grek.chunk.02.jsonl", "position": 941835658, "block_size": 1, "offset": 0, "current_iter": 0}, "ind_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ind_Latn/fineweb_2_hq.ind_Latn.chunk.02.jsonl", "position": 676172817, "block_size": 1, "offset": 0, "current_iter": 0}, "nld_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/nld_Latn/fineweb_2_hq.nld_Latn.chunk.02.jsonl", "position": 541629871, "block_size": 1, "offset": 0, "current_iter": 0}, "pol_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/pol_Latn/fineweb_2_hq.pol_Latn.chunk.02.jsonl", "position": 564135258, "block_size": 1, "offset": 0, "current_iter": 0}, "por_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/por_Latn/fineweb_2_hq.por_Latn.chunk.02.jsonl", "position": 588481521, "block_size": 1, "offset": 0, "current_iter": 0}, "ita_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ita_Latn/fineweb_2_hq.ita_Latn.chunk.02.jsonl", "position": 654992024, "block_size": 1, "offset": 0, "current_iter": 0}, "jpn_Jpan": {"file_path": "/scratch/craffel/lingua/data/flexitok/jpn_Jpan/fineweb_2_hq.jpn_Jpan.chunk.02.jsonl", "position": 690407469, "block_size": 1, "offset": 0, "current_iter": 0}, "fra_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/fra_Latn/fineweb_2_hq.fra_Latn.chunk.02.jsonl", "position": 914700252, "block_size": 1, "offset": 0, "current_iter": 0}, "spa_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/spa_Latn/fineweb_2_hq.spa_Latn.chunk.02.jsonl", "position": 814542366, "block_size": 1, "offset": 0, "current_iter": 0}, "deu_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/deu_Latn/fineweb_2_hq.deu_Latn.chunk.02.jsonl", "position": 995536190, "block_size": 1, "offset": 0, "current_iter": 0}, "cmn_Hani": {"file_path": "/scratch/craffel/lingua/data/flexitok/cmn_Hani/fineweb_2_hq.cmn_Hani.chunk.02.jsonl", "position": 1048800700, "block_size": 1, "offset": 0, "current_iter": 0}, "rus_Cyrl": {"file_path": "/scratch/craffel/lingua/data/flexitok/rus_Cyrl/fineweb_2_hq.rus_Cyrl.chunk.02.jsonl", "position": 2468688937, "block_size": 1, "offset": 0, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 56295390063826420216911862687271462270, "inc": 234358335530849485425064040311006256713}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "supertokenizer", "path": "meta-llama/Llama-3.2-1B", "tokenizers": [{"name": "huggingface", "path": "flexitok/bpe_arb_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ces_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_cmn_Hani_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_dan_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_deu_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ell_Grek_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fas_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fra_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fw_edu_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_hun_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ind_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ita_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_jpn_Jpan_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_nld_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_pol_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_por_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_rus_Cyrl_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_spa_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_swe_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_tur_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_vie_Latn_8000", "load_supermapping": true}], "dropout": 0.0, "rng_state": {"bit_generator": "PCG64", "state": {"state": 97043016590011472558194533793952406301, "inc": 332724090758049132448979897138935081983}, "has_uint32": 0, "uinteger": 847341261}, "seed": 42, "superset_code_name": "fineweb2_hq", "n_words": 113764}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 256, "rng_state": {"bit_generator": "PCG64", "state": {"state": 172206642884315098528897268843500314535, "inc": 319170006889470250209362588441616495209}, "has_uint32": 1, "uinteger": 2183176397}, "batch_size": 4, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.001], "last_epoch": 100000, "verbose": false, "_step_count": 100001, "_get_lr_called_within_step": false, "_last_lr": [0.001], "lr_lambdas": [{}]}}
fineweb2_hq_superset_lang_tokenizers/0000100000/train_state_00003.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 100000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 491, "it_state": {"it_state": {"root_dir": "/scratch/craffel/lingua/data/flexitok/", "sources": {"fw_edu": 0.4, "dan_Latn": 0.0216582869670702, "swe_Latn": 0.0216359765418466, "vie_Latn": 0.0197485510268674, "hun_Latn": 0.0247194573562308, "fas_Arab": 0.0205634624231076, "tur_Latn": 0.0235455794841729, "ces_Latn": 0.0248024455266208, "arb_Arab": 0.0234323706569333, "ell_Grek": 0.0233670886888026, "ind_Latn": 0.0269322054593488, "nld_Latn": 0.0277796326621489, "pol_Latn": 0.0294120104572311, "por_Latn": 0.0301413168306825, "ita_Latn": 0.0324056371021865, "jpn_Jpan": 0.03553104151369, "fra_Latn": 0.0381835560678536, "spa_Latn": 0.0387222793083669, "deu_Latn": 0.0419925340453022, "cmn_Hani": 0.0454067521384114, "rus_Cyrl": 0.0500198157431261}, "source_to_state": {"fw_edu": {"file_path": "/scratch/craffel/lingua/data/flexitok/fw_edu/fineweb_edu_100bt.chunk.03.jsonl", "position": 11325049679, "block_size": 1, "offset": 0, "current_iter": 0}, "dan_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/dan_Latn/fineweb_2_hq.dan_Latn.chunk.03.jsonl", "position": 516109212, "block_size": 1, "offset": 0, "current_iter": 0}, "swe_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/swe_Latn/fineweb_2_hq.swe_Latn.chunk.03.jsonl", "position": 483437605, "block_size": 1, "offset": 0, "current_iter": 0}, "vie_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/vie_Latn/fineweb_2_hq.vie_Latn.chunk.03.jsonl", "position": 525165571, "block_size": 1, "offset": 0, "current_iter": 0}, "hun_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/hun_Latn/fineweb_2_hq.hun_Latn.chunk.03.jsonl", "position": 694085919, "block_size": 1, "offset": 0, "current_iter": 0}, "fas_Arab": {"file_path": "/scratch/craffel/lingua/data/flexitok/fas_Arab/fineweb_2_hq.fas_Arab.chunk.03.jsonl", "position": 575848802, "block_size": 1, "offset": 0, "current_iter": 0}, "tur_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/tur_Latn/fineweb_2_hq.tur_Latn.chunk.03.jsonl", "position": 448698824, "block_size": 1, "offset": 0, "current_iter": 0}, "ces_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ces_Latn/fineweb_2_hq.ces_Latn.chunk.03.jsonl", "position": 678597110, "block_size": 1, "offset": 0, "current_iter": 0}, "arb_Arab": {"file_path": "/scratch/craffel/lingua/data/flexitok/arb_Arab/fineweb_2_hq.arb_Arab.chunk.03.jsonl", "position": 768075911, "block_size": 1, "offset": 0, "current_iter": 0}, "ell_Grek": {"file_path": "/scratch/craffel/lingua/data/flexitok/ell_Grek/fineweb_2_hq.ell_Grek.chunk.03.jsonl", "position": 956309135, "block_size": 1, "offset": 0, "current_iter": 0}, "ind_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ind_Latn/fineweb_2_hq.ind_Latn.chunk.03.jsonl", "position": 669917107, "block_size": 1, "offset": 0, "current_iter": 0}, "nld_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/nld_Latn/fineweb_2_hq.nld_Latn.chunk.03.jsonl", "position": 544401328, "block_size": 1, "offset": 0, "current_iter": 0}, "pol_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/pol_Latn/fineweb_2_hq.pol_Latn.chunk.03.jsonl", "position": 560403057, "block_size": 1, "offset": 0, "current_iter": 0}, "por_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/por_Latn/fineweb_2_hq.por_Latn.chunk.03.jsonl", "position": 583847913, "block_size": 1, "offset": 0, "current_iter": 0}, "ita_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ita_Latn/fineweb_2_hq.ita_Latn.chunk.03.jsonl", "position": 656186520, "block_size": 1, "offset": 0, "current_iter": 0}, "jpn_Jpan": {"file_path": "/scratch/craffel/lingua/data/flexitok/jpn_Jpan/fineweb_2_hq.jpn_Jpan.chunk.03.jsonl", "position": 681322222, "block_size": 1, "offset": 0, "current_iter": 0}, "fra_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/fra_Latn/fineweb_2_hq.fra_Latn.chunk.03.jsonl", "position": 914607626, "block_size": 1, "offset": 0, "current_iter": 0}, "spa_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/spa_Latn/fineweb_2_hq.spa_Latn.chunk.03.jsonl", "position": 825041671, "block_size": 1, "offset": 0, "current_iter": 0}, "deu_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/deu_Latn/fineweb_2_hq.deu_Latn.chunk.03.jsonl", "position": 998019475, "block_size": 1, "offset": 0, "current_iter": 0}, "cmn_Hani": {"file_path": "/scratch/craffel/lingua/data/flexitok/cmn_Hani/fineweb_2_hq.cmn_Hani.chunk.03.jsonl", "position": 1045000513, "block_size": 1, "offset": 0, "current_iter": 0}, "rus_Cyrl": {"file_path": "/scratch/craffel/lingua/data/flexitok/rus_Cyrl/fineweb_2_hq.rus_Cyrl.chunk.03.jsonl", "position": 2497697818, "block_size": 1, "offset": 0, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 210618130920149505244984849848612552299, "inc": 148211758571781046255077612135386035203}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "supertokenizer", "path": "meta-llama/Llama-3.2-1B", "tokenizers": [{"name": "huggingface", "path": "flexitok/bpe_arb_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ces_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_cmn_Hani_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_dan_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_deu_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ell_Grek_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fas_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fra_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fw_edu_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_hun_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ind_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ita_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_jpn_Jpan_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_nld_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_pol_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_por_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_rus_Cyrl_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_spa_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_swe_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_tur_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_vie_Latn_8000", "load_supermapping": true}], "dropout": 0.0, "rng_state": {"bit_generator": "PCG64", "state": {"state": 18651544543914873453529996365398441002, "inc": 332724090758049132448979897138935081983}, "has_uint32": 1, "uinteger": 2088473259}, "seed": 42, "superset_code_name": "fineweb2_hq", "n_words": 113764}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 256, "rng_state": {"bit_generator": "PCG64", "state": {"state": 54691439170624420224489263842165716902, "inc": 115810872492597857501795428972873905393}, "has_uint32": 1, "uinteger": 1237320779}, "batch_size": 4, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.001], "last_epoch": 100000, "verbose": false, "_step_count": 100001, "_get_lr_called_within_step": false, "_last_lr": [0.001], "lr_lambdas": [{}]}}
fineweb2_hq_superset_lang_tokenizers/0000100000/train_state_00004.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 100000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 1183, "it_state": {"it_state": {"root_dir": "/scratch/craffel/lingua/data/flexitok/", "sources": {"fw_edu": 0.4, "dan_Latn": 0.0216582869670702, "swe_Latn": 0.0216359765418466, "vie_Latn": 0.0197485510268674, "hun_Latn": 0.0247194573562308, "fas_Arab": 0.0205634624231076, "tur_Latn": 0.0235455794841729, "ces_Latn": 0.0248024455266208, "arb_Arab": 0.0234323706569333, "ell_Grek": 0.0233670886888026, "ind_Latn": 0.0269322054593488, "nld_Latn": 0.0277796326621489, "pol_Latn": 0.0294120104572311, "por_Latn": 0.0301413168306825, "ita_Latn": 0.0324056371021865, "jpn_Jpan": 0.03553104151369, "fra_Latn": 0.0381835560678536, "spa_Latn": 0.0387222793083669, "deu_Latn": 0.0419925340453022, "cmn_Hani": 0.0454067521384114, "rus_Cyrl": 0.0500198157431261}, "source_to_state": {"fw_edu": {"file_path": "/scratch/craffel/lingua/data/flexitok/fw_edu/fineweb_edu_100bt.chunk.04.jsonl", "position": 11332591211, "block_size": 1, "offset": 0, "current_iter": 0}, "dan_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/dan_Latn/fineweb_2_hq.dan_Latn.chunk.04.jsonl", "position": 525712774, "block_size": 1, "offset": 0, "current_iter": 0}, "swe_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/swe_Latn/fineweb_2_hq.swe_Latn.chunk.04.jsonl", "position": 487909241, "block_size": 1, "offset": 0, "current_iter": 0}, "vie_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/vie_Latn/fineweb_2_hq.vie_Latn.chunk.04.jsonl", "position": 523146782, "block_size": 1, "offset": 0, "current_iter": 0}, "hun_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/hun_Latn/fineweb_2_hq.hun_Latn.chunk.04.jsonl", "position": 694630823, "block_size": 1, "offset": 0, "current_iter": 0}, "fas_Arab": {"file_path": "/scratch/craffel/lingua/data/flexitok/fas_Arab/fineweb_2_hq.fas_Arab.chunk.04.jsonl", "position": 580686126, "block_size": 1, "offset": 0, "current_iter": 0}, "tur_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/tur_Latn/fineweb_2_hq.tur_Latn.chunk.04.jsonl", "position": 444267870, "block_size": 1, "offset": 0, "current_iter": 0}, "ces_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ces_Latn/fineweb_2_hq.ces_Latn.chunk.04.jsonl", "position": 685645271, "block_size": 1, "offset": 0, "current_iter": 0}, "arb_Arab": {"file_path": "/scratch/craffel/lingua/data/flexitok/arb_Arab/fineweb_2_hq.arb_Arab.chunk.04.jsonl", "position": 766680026, "block_size": 1, "offset": 0, "current_iter": 0}, "ell_Grek": {"file_path": "/scratch/craffel/lingua/data/flexitok/ell_Grek/fineweb_2_hq.ell_Grek.chunk.04.jsonl", "position": 962271147, "block_size": 1, "offset": 0, "current_iter": 0}, "ind_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ind_Latn/fineweb_2_hq.ind_Latn.chunk.04.jsonl", "position": 668658671, "block_size": 1, "offset": 0, "current_iter": 0}, "nld_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/nld_Latn/fineweb_2_hq.nld_Latn.chunk.04.jsonl", "position": 542330979, "block_size": 1, "offset": 0, "current_iter": 0}, "pol_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/pol_Latn/fineweb_2_hq.pol_Latn.chunk.04.jsonl", "position": 567361928, "block_size": 1, "offset": 0, "current_iter": 0}, "por_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/por_Latn/fineweb_2_hq.por_Latn.chunk.04.jsonl", "position": 592030938, "block_size": 1, "offset": 0, "current_iter": 0}, "ita_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ita_Latn/fineweb_2_hq.ita_Latn.chunk.04.jsonl", "position": 651408386, "block_size": 1, "offset": 0, "current_iter": 0}, "jpn_Jpan": {"file_path": "/scratch/craffel/lingua/data/flexitok/jpn_Jpan/fineweb_2_hq.jpn_Jpan.chunk.04.jsonl", "position": 696158052, "block_size": 1, "offset": 0, "current_iter": 0}, "fra_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/fra_Latn/fineweb_2_hq.fra_Latn.chunk.04.jsonl", "position": 916915450, "block_size": 1, "offset": 0, "current_iter": 0}, "spa_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/spa_Latn/fineweb_2_hq.spa_Latn.chunk.04.jsonl", "position": 817502742, "block_size": 1, "offset": 0, "current_iter": 0}, "deu_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/deu_Latn/fineweb_2_hq.deu_Latn.chunk.04.jsonl", "position": 987989339, "block_size": 1, "offset": 0, "current_iter": 0}, "cmn_Hani": {"file_path": "/scratch/craffel/lingua/data/flexitok/cmn_Hani/fineweb_2_hq.cmn_Hani.chunk.04.jsonl", "position": 1052257313, "block_size": 1, "offset": 0, "current_iter": 0}, "rus_Cyrl": {"file_path": "/scratch/craffel/lingua/data/flexitok/rus_Cyrl/fineweb_2_hq.rus_Cyrl.chunk.04.jsonl", "position": 2459770850, "block_size": 1, "offset": 0, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 3598758105482535893984038932740196087, "inc": 186633262021180533256729114674950595327}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "supertokenizer", "path": "meta-llama/Llama-3.2-1B", "tokenizers": [{"name": "huggingface", "path": "flexitok/bpe_arb_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ces_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_cmn_Hani_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_dan_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_deu_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ell_Grek_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fas_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fra_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fw_edu_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_hun_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ind_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ita_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_jpn_Jpan_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_nld_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_pol_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_por_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_rus_Cyrl_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_spa_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_swe_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_tur_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_vie_Latn_8000", "load_supermapping": true}], "dropout": 0.0, "rng_state": {"bit_generator": "PCG64", "state": {"state": 314306568961318947316649307630591802769, "inc": 332724090758049132448979897138935081983}, "has_uint32": 1, "uinteger": 2753265584}, "seed": 42, "superset_code_name": "fineweb2_hq", "n_words": 113764}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 256, "rng_state": {"bit_generator": "PCG64", "state": {"state": 330736315247521707224292038935126153445, "inc": 303111205818808944921858206842105131807}, "has_uint32": 1, "uinteger": 211256137}, "batch_size": 4, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.001], "last_epoch": 100000, "verbose": false, "_step_count": 100001, "_get_lr_called_within_step": false, "_last_lr": [0.001], "lr_lambdas": [{}]}}
fineweb2_hq_superset_lang_tokenizers/0000100000/train_state_00005.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 100000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 1780, "it_state": {"it_state": {"root_dir": "/scratch/craffel/lingua/data/flexitok/", "sources": {"fw_edu": 0.4, "dan_Latn": 0.0216582869670702, "swe_Latn": 0.0216359765418466, "vie_Latn": 0.0197485510268674, "hun_Latn": 0.0247194573562308, "fas_Arab": 0.0205634624231076, "tur_Latn": 0.0235455794841729, "ces_Latn": 0.0248024455266208, "arb_Arab": 0.0234323706569333, "ell_Grek": 0.0233670886888026, "ind_Latn": 0.0269322054593488, "nld_Latn": 0.0277796326621489, "pol_Latn": 0.0294120104572311, "por_Latn": 0.0301413168306825, "ita_Latn": 0.0324056371021865, "jpn_Jpan": 0.03553104151369, "fra_Latn": 0.0381835560678536, "spa_Latn": 0.0387222793083669, "deu_Latn": 0.0419925340453022, "cmn_Hani": 0.0454067521384114, "rus_Cyrl": 0.0500198157431261}, "source_to_state": {"fw_edu": {"file_path": "/scratch/craffel/lingua/data/flexitok/fw_edu/fineweb_edu_100bt.chunk.05.jsonl", "position": 11347432620, "block_size": 1, "offset": 0, "current_iter": 0}, "dan_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/dan_Latn/fineweb_2_hq.dan_Latn.chunk.05.jsonl", "position": 519264013, "block_size": 1, "offset": 0, "current_iter": 0}, "swe_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/swe_Latn/fineweb_2_hq.swe_Latn.chunk.05.jsonl", "position": 485669741, "block_size": 1, "offset": 0, "current_iter": 0}, "vie_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/vie_Latn/fineweb_2_hq.vie_Latn.chunk.05.jsonl", "position": 527267611, "block_size": 1, "offset": 0, "current_iter": 0}, "hun_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/hun_Latn/fineweb_2_hq.hun_Latn.chunk.05.jsonl", "position": 693361348, "block_size": 1, "offset": 0, "current_iter": 0}, "fas_Arab": {"file_path": "/scratch/craffel/lingua/data/flexitok/fas_Arab/fineweb_2_hq.fas_Arab.chunk.05.jsonl", "position": 579299828, "block_size": 1, "offset": 0, "current_iter": 0}, "tur_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/tur_Latn/fineweb_2_hq.tur_Latn.chunk.05.jsonl", "position": 439464535, "block_size": 1, "offset": 0, "current_iter": 0}, "ces_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ces_Latn/fineweb_2_hq.ces_Latn.chunk.05.jsonl", "position": 679967861, "block_size": 1, "offset": 0, "current_iter": 0}, "arb_Arab": {"file_path": "/scratch/craffel/lingua/data/flexitok/arb_Arab/fineweb_2_hq.arb_Arab.chunk.05.jsonl", "position": 774619934, "block_size": 1, "offset": 0, "current_iter": 0}, "ell_Grek": {"file_path": "/scratch/craffel/lingua/data/flexitok/ell_Grek/fineweb_2_hq.ell_Grek.chunk.05.jsonl", "position": 957084751, "block_size": 1, "offset": 0, "current_iter": 0}, "ind_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ind_Latn/fineweb_2_hq.ind_Latn.chunk.05.jsonl", "position": 678323787, "block_size": 1, "offset": 0, "current_iter": 0}, "nld_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/nld_Latn/fineweb_2_hq.nld_Latn.chunk.05.jsonl", "position": 536022347, "block_size": 1, "offset": 0, "current_iter": 0}, "pol_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/pol_Latn/fineweb_2_hq.pol_Latn.chunk.05.jsonl", "position": 572699474, "block_size": 1, "offset": 0, "current_iter": 0}, "por_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/por_Latn/fineweb_2_hq.por_Latn.chunk.05.jsonl", "position": 585976598, "block_size": 1, "offset": 0, "current_iter": 0}, "ita_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ita_Latn/fineweb_2_hq.ita_Latn.chunk.05.jsonl", "position": 653574516, "block_size": 1, "offset": 0, "current_iter": 0}, "jpn_Jpan": {"file_path": "/scratch/craffel/lingua/data/flexitok/jpn_Jpan/fineweb_2_hq.jpn_Jpan.chunk.05.jsonl", "position": 688532021, "block_size": 1, "offset": 0, "current_iter": 0}, "fra_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/fra_Latn/fineweb_2_hq.fra_Latn.chunk.05.jsonl", "position": 915105214, "block_size": 1, "offset": 0, "current_iter": 0}, "spa_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/spa_Latn/fineweb_2_hq.spa_Latn.chunk.05.jsonl", "position": 824438083, "block_size": 1, "offset": 0, "current_iter": 0}, "deu_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/deu_Latn/fineweb_2_hq.deu_Latn.chunk.05.jsonl", "position": 990849934, "block_size": 1, "offset": 0, "current_iter": 0}, "cmn_Hani": {"file_path": "/scratch/craffel/lingua/data/flexitok/cmn_Hani/fineweb_2_hq.cmn_Hani.chunk.05.jsonl", "position": 1047523136, "block_size": 1, "offset": 0, "current_iter": 0}, "rus_Cyrl": {"file_path": "/scratch/craffel/lingua/data/flexitok/rus_Cyrl/fineweb_2_hq.rus_Cyrl.chunk.05.jsonl", "position": 2458330353, "block_size": 1, "offset": 0, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 270881788040608490194317615548038457755, "inc": 329233669073478483697346584247981015037}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "supertokenizer", "path": "meta-llama/Llama-3.2-1B", "tokenizers": [{"name": "huggingface", "path": "flexitok/bpe_arb_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ces_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_cmn_Hani_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_dan_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_deu_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ell_Grek_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fas_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fra_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fw_edu_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_hun_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ind_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ita_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_jpn_Jpan_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_nld_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_pol_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_por_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_rus_Cyrl_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_spa_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_swe_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_tur_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_vie_Latn_8000", "load_supermapping": true}], "dropout": 0.0, "rng_state": {"bit_generator": "PCG64", "state": {"state": 195464240413682888639473904508383018964, "inc": 332724090758049132448979897138935081983}, "has_uint32": 1, "uinteger": 519955542}, "seed": 42, "superset_code_name": "fineweb2_hq", "n_words": 113764}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 256, "rng_state": {"bit_generator": "PCG64", "state": {"state": 152527936818944850177382785176910810558, "inc": 47382953940698287647753879262736142901}, "has_uint32": 1, "uinteger": 3675784275}, "batch_size": 4, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.001], "last_epoch": 100000, "verbose": false, "_step_count": 100001, "_get_lr_called_within_step": false, "_last_lr": [0.001], "lr_lambdas": [{}]}}
fineweb2_hq_superset_lang_tokenizers/0000100000/train_state_00006.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 100000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 652, "it_state": {"it_state": {"root_dir": "/scratch/craffel/lingua/data/flexitok/", "sources": {"fw_edu": 0.4, "dan_Latn": 0.0216582869670702, "swe_Latn": 0.0216359765418466, "vie_Latn": 0.0197485510268674, "hun_Latn": 0.0247194573562308, "fas_Arab": 0.0205634624231076, "tur_Latn": 0.0235455794841729, "ces_Latn": 0.0248024455266208, "arb_Arab": 0.0234323706569333, "ell_Grek": 0.0233670886888026, "ind_Latn": 0.0269322054593488, "nld_Latn": 0.0277796326621489, "pol_Latn": 0.0294120104572311, "por_Latn": 0.0301413168306825, "ita_Latn": 0.0324056371021865, "jpn_Jpan": 0.03553104151369, "fra_Latn": 0.0381835560678536, "spa_Latn": 0.0387222793083669, "deu_Latn": 0.0419925340453022, "cmn_Hani": 0.0454067521384114, "rus_Cyrl": 0.0500198157431261}, "source_to_state": {"fw_edu": {"file_path": "/scratch/craffel/lingua/data/flexitok/fw_edu/fineweb_edu_100bt.chunk.06.jsonl", "position": 11357863103, "block_size": 1, "offset": 0, "current_iter": 0}, "dan_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/dan_Latn/fineweb_2_hq.dan_Latn.chunk.06.jsonl", "position": 526362743, "block_size": 1, "offset": 0, "current_iter": 0}, "swe_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/swe_Latn/fineweb_2_hq.swe_Latn.chunk.06.jsonl", "position": 492011166, "block_size": 1, "offset": 0, "current_iter": 0}, "vie_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/vie_Latn/fineweb_2_hq.vie_Latn.chunk.06.jsonl", "position": 519856774, "block_size": 1, "offset": 0, "current_iter": 0}, "hun_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/hun_Latn/fineweb_2_hq.hun_Latn.chunk.06.jsonl", "position": 693927002, "block_size": 1, "offset": 0, "current_iter": 0}, "fas_Arab": {"file_path": "/scratch/craffel/lingua/data/flexitok/fas_Arab/fineweb_2_hq.fas_Arab.chunk.06.jsonl", "position": 581355386, "block_size": 1, "offset": 0, "current_iter": 0}, "tur_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/tur_Latn/fineweb_2_hq.tur_Latn.chunk.06.jsonl", "position": 446133428, "block_size": 1, "offset": 0, "current_iter": 0}, "ces_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ces_Latn/fineweb_2_hq.ces_Latn.chunk.06.jsonl", "position": 684189574, "block_size": 1, "offset": 0, "current_iter": 0}, "arb_Arab": {"file_path": "/scratch/craffel/lingua/data/flexitok/arb_Arab/fineweb_2_hq.arb_Arab.chunk.06.jsonl", "position": 769829605, "block_size": 1, "offset": 0, "current_iter": 0}, "ell_Grek": {"file_path": "/scratch/craffel/lingua/data/flexitok/ell_Grek/fineweb_2_hq.ell_Grek.chunk.06.jsonl", "position": 956807966, "block_size": 1, "offset": 0, "current_iter": 0}, "ind_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ind_Latn/fineweb_2_hq.ind_Latn.chunk.06.jsonl", "position": 682042781, "block_size": 1, "offset": 0, "current_iter": 0}, "nld_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/nld_Latn/fineweb_2_hq.nld_Latn.chunk.06.jsonl", "position": 533657133, "block_size": 1, "offset": 0, "current_iter": 0}, "pol_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/pol_Latn/fineweb_2_hq.pol_Latn.chunk.06.jsonl", "position": 566289777, "block_size": 1, "offset": 0, "current_iter": 0}, "por_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/por_Latn/fineweb_2_hq.por_Latn.chunk.06.jsonl", "position": 587831641, "block_size": 1, "offset": 0, "current_iter": 0}, "ita_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ita_Latn/fineweb_2_hq.ita_Latn.chunk.06.jsonl", "position": 656828475, "block_size": 1, "offset": 0, "current_iter": 0}, "jpn_Jpan": {"file_path": "/scratch/craffel/lingua/data/flexitok/jpn_Jpan/fineweb_2_hq.jpn_Jpan.chunk.06.jsonl", "position": 684992246, "block_size": 1, "offset": 0, "current_iter": 0}, "fra_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/fra_Latn/fineweb_2_hq.fra_Latn.chunk.06.jsonl", "position": 911324697, "block_size": 1, "offset": 0, "current_iter": 0}, "spa_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/spa_Latn/fineweb_2_hq.spa_Latn.chunk.06.jsonl", "position": 818884506, "block_size": 1, "offset": 0, "current_iter": 0}, "deu_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/deu_Latn/fineweb_2_hq.deu_Latn.chunk.06.jsonl", "position": 994555212, "block_size": 1, "offset": 0, "current_iter": 0}, "cmn_Hani": {"file_path": "/scratch/craffel/lingua/data/flexitok/cmn_Hani/fineweb_2_hq.cmn_Hani.chunk.06.jsonl", "position": 1052463619, "block_size": 1, "offset": 0, "current_iter": 0}, "rus_Cyrl": {"file_path": "/scratch/craffel/lingua/data/flexitok/rus_Cyrl/fineweb_2_hq.rus_Cyrl.chunk.06.jsonl", "position": 2452844448, "block_size": 1, "offset": 0, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 151765474274408922443398723870727375197, "inc": 95963489890761403814531195999220475639}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "supertokenizer", "path": "meta-llama/Llama-3.2-1B", "tokenizers": [{"name": "huggingface", "path": "flexitok/bpe_arb_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ces_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_cmn_Hani_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_dan_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_deu_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ell_Grek_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fas_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fra_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fw_edu_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_hun_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ind_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ita_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_jpn_Jpan_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_nld_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_pol_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_por_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_rus_Cyrl_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_spa_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_swe_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_tur_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_vie_Latn_8000", "load_supermapping": true}], "dropout": 0.0, "rng_state": {"bit_generator": "PCG64", "state": {"state": 49112411000072676943166832600275581639, "inc": 332724090758049132448979897138935081983}, "has_uint32": 0, "uinteger": 2458760163}, "seed": 42, "superset_code_name": "fineweb2_hq", "n_words": 113764}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 256, "rng_state": {"bit_generator": "PCG64", "state": {"state": 131915213571014219328634582687476804074, "inc": 72545526324180839152750112646078969085}, "has_uint32": 0, "uinteger": 2783644522}, "batch_size": 4, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.001], "last_epoch": 100000, "verbose": false, "_step_count": 100001, "_get_lr_called_within_step": false, "_last_lr": [0.001], "lr_lambdas": [{}]}}
fineweb2_hq_superset_lang_tokenizers/0000100000/train_state_00007.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 100000, "acc_step": 0, "data_loader_state": {"it_state": {"start_token": 18163, "it_state": {"it_state": {"root_dir": "/scratch/craffel/lingua/data/flexitok/", "sources": {"fw_edu": 0.4, "dan_Latn": 0.0216582869670702, "swe_Latn": 0.0216359765418466, "vie_Latn": 0.0197485510268674, "hun_Latn": 0.0247194573562308, "fas_Arab": 0.0205634624231076, "tur_Latn": 0.0235455794841729, "ces_Latn": 0.0248024455266208, "arb_Arab": 0.0234323706569333, "ell_Grek": 0.0233670886888026, "ind_Latn": 0.0269322054593488, "nld_Latn": 0.0277796326621489, "pol_Latn": 0.0294120104572311, "por_Latn": 0.0301413168306825, "ita_Latn": 0.0324056371021865, "jpn_Jpan": 0.03553104151369, "fra_Latn": 0.0381835560678536, "spa_Latn": 0.0387222793083669, "deu_Latn": 0.0419925340453022, "cmn_Hani": 0.0454067521384114, "rus_Cyrl": 0.0500198157431261}, "source_to_state": {"fw_edu": {"file_path": "/scratch/craffel/lingua/data/flexitok/fw_edu/fineweb_edu_100bt.chunk.07.jsonl", "position": 11346253721, "block_size": 1, "offset": 0, "current_iter": 0}, "dan_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/dan_Latn/fineweb_2_hq.dan_Latn.chunk.07.jsonl", "position": 523755382, "block_size": 1, "offset": 0, "current_iter": 0}, "swe_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/swe_Latn/fineweb_2_hq.swe_Latn.chunk.07.jsonl", "position": 492596254, "block_size": 1, "offset": 0, "current_iter": 0}, "vie_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/vie_Latn/fineweb_2_hq.vie_Latn.chunk.07.jsonl", "position": 524926995, "block_size": 1, "offset": 0, "current_iter": 0}, "hun_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/hun_Latn/fineweb_2_hq.hun_Latn.chunk.07.jsonl", "position": 691151624, "block_size": 1, "offset": 0, "current_iter": 0}, "fas_Arab": {"file_path": "/scratch/craffel/lingua/data/flexitok/fas_Arab/fineweb_2_hq.fas_Arab.chunk.07.jsonl", "position": 585569182, "block_size": 1, "offset": 0, "current_iter": 0}, "tur_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/tur_Latn/fineweb_2_hq.tur_Latn.chunk.07.jsonl", "position": 445768161, "block_size": 1, "offset": 0, "current_iter": 0}, "ces_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ces_Latn/fineweb_2_hq.ces_Latn.chunk.07.jsonl", "position": 676262491, "block_size": 1, "offset": 0, "current_iter": 0}, "arb_Arab": {"file_path": "/scratch/craffel/lingua/data/flexitok/arb_Arab/fineweb_2_hq.arb_Arab.chunk.07.jsonl", "position": 767375917, "block_size": 1, "offset": 0, "current_iter": 0}, "ell_Grek": {"file_path": "/scratch/craffel/lingua/data/flexitok/ell_Grek/fineweb_2_hq.ell_Grek.chunk.07.jsonl", "position": 957696312, "block_size": 1, "offset": 0, "current_iter": 0}, "ind_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ind_Latn/fineweb_2_hq.ind_Latn.chunk.07.jsonl", "position": 671221092, "block_size": 1, "offset": 0, "current_iter": 0}, "nld_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/nld_Latn/fineweb_2_hq.nld_Latn.chunk.07.jsonl", "position": 541646266, "block_size": 1, "offset": 0, "current_iter": 0}, "pol_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/pol_Latn/fineweb_2_hq.pol_Latn.chunk.07.jsonl", "position": 572167055, "block_size": 1, "offset": 0, "current_iter": 0}, "por_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/por_Latn/fineweb_2_hq.por_Latn.chunk.07.jsonl", "position": 583188043, "block_size": 1, "offset": 0, "current_iter": 0}, "ita_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/ita_Latn/fineweb_2_hq.ita_Latn.chunk.07.jsonl", "position": 652626879, "block_size": 1, "offset": 0, "current_iter": 0}, "jpn_Jpan": {"file_path": "/scratch/craffel/lingua/data/flexitok/jpn_Jpan/fineweb_2_hq.jpn_Jpan.chunk.07.jsonl", "position": 689129651, "block_size": 1, "offset": 0, "current_iter": 0}, "fra_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/fra_Latn/fineweb_2_hq.fra_Latn.chunk.07.jsonl", "position": 915409089, "block_size": 1, "offset": 0, "current_iter": 0}, "spa_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/spa_Latn/fineweb_2_hq.spa_Latn.chunk.07.jsonl", "position": 823620132, "block_size": 1, "offset": 0, "current_iter": 0}, "deu_Latn": {"file_path": "/scratch/craffel/lingua/data/flexitok/deu_Latn/fineweb_2_hq.deu_Latn.chunk.07.jsonl", "position": 987237319, "block_size": 1, "offset": 0, "current_iter": 0}, "cmn_Hani": {"file_path": "/scratch/craffel/lingua/data/flexitok/cmn_Hani/fineweb_2_hq.cmn_Hani.chunk.07.jsonl", "position": 1055544549, "block_size": 1, "offset": 0, "current_iter": 0}, "rus_Cyrl": {"file_path": "/scratch/craffel/lingua/data/flexitok/rus_Cyrl/fineweb_2_hq.rus_Cyrl.chunk.07.jsonl", "position": 2455732657, "block_size": 1, "offset": 0, "current_iter": 0}}, "rng_state": {"bit_generator": "PCG64", "state": {"state": 221574967649918375740080961099707042490, "inc": 53245743019587277358203950863334653629}, "has_uint32": 0, "uinteger": 0}}, "add_bos": true, "add_eos": true, "name": "supertokenizer", "path": "meta-llama/Llama-3.2-1B", "tokenizers": [{"name": "huggingface", "path": "flexitok/bpe_arb_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ces_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_cmn_Hani_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_dan_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_deu_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ell_Grek_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fas_Arab_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fra_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_fw_edu_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_hun_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ind_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_ita_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_jpn_Jpan_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_nld_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_pol_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_por_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_rus_Cyrl_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_spa_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_swe_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_tur_Latn_8000", "load_supermapping": true}, {"name": "huggingface", "path": "flexitok/bpe_vie_Latn_8000", "load_supermapping": true}], "dropout": 0.0, "rng_state": {"bit_generator": "PCG64", "state": {"state": 284713615769171662415174361143730162695, "inc": 332724090758049132448979897138935081983}, "has_uint32": 0, "uinteger": 2207494559}, "seed": 42, "superset_code_name": "fineweb2_hq", "n_words": 113764}, "output_seq_len": 4096, "n_views": 2}, "seq_idx": 256, "rng_state": {"bit_generator": "PCG64", "state": {"state": 239803483826676955776584746976189400951, "inc": 19761753544780285878460645500694854795}, "has_uint32": 1, "uinteger": 513301027}, "batch_size": 4, "prefetch_size": 1024}, "scheduler": {"base_lrs": [0.001], "last_epoch": 100000, "verbose": false, "_step_count": 100001, "_get_lr_called_within_step": false, "_last_lr": [0.001], "lr_lambdas": [{}]}}