{
  "name": "debug",
  "dump_dir": "/home/kieron/fyp/blt/tmp/blt-entropy",
  "seed": 777,
  "debug_dynamo": false,
  "grad_acc_steps": 1,
  "gc_collect_freq": 1000,
  "probe_freq": null,
  "steps": 3500,
  "max_steps": null,
  "data": {
    "s3_profile": null,
    "root_dir": "/home/kieron/fyp/data/mc4_SEA_1000000_sentences",
    "sources": {
      "combined": 1.0
    },
    "batch_size": 16,
    "seq_len": 8192,
    "seed": 42,
    "add_bos": true,
    "add_eos": true,
    "load_async": true,
    "async_persist_type": "exact",
    "prefetch_size": 64,
    "preprocess_dir": null,
    "dataset_files": null,
    "entropy_model_name": "transformer_100m",
    "arrow_batch_size": 20,
    "buffer_size": 64,
    "file_format": "json",
    "pad_to_max_length": true,
    "max_encoder_seq_length": 8192,
    "enable_byte_ngrams": false,
    "add_patches": false,
    "tokenizer_args": {
      "name": "blt",
      "init_kwargs": null
    },
    "patcher_args": {
      "patching_mode": "byte",
      "patching_device": "cuda",
      "entropy_model_checkpoint_dir": null,
      "realtime_patching": false,
      "threshold": 1.335442066192627,
      "threshold_add": null,
      "max_patch_length": null,
      "patch_size": 4.5,
      "patching_batch_size": 1,
      "device": "cuda",
      "monotonicity": false,
      "log_time": false
    }
  },
  "optim": {
    "lr": 0.0004,
    "weight_decay": 0.1,
    "epsilon": 1e-8,
    "beta1": 0.9,
    "beta2": 0.95,
    "clip": 10.0,
    "scheduler": "cosine",
    "warmup": 500,
    "lr_min_ratio": 0.1,
    "cycle_length": 1.0,
    "cosine_theta": 1.0,
    "annealing_step": 1000,
    "decay_fraction": 0.1,
    "exp_factor": 0.5
  },
  "model": null,
  "entropy_model": {
    "dim": 512,
    "n_layers": 14,
    "head_dim": null,
    "n_heads": 8,
    "n_kv_heads": null,
    "ffn_dim_multiplier": 1.0,
    "multiple_of": 256,
    "norm_eps": 0.00001,
    "rope_theta": 10000.0,
    "rope_use_fp32_in_outer_product": false,
    "init_base_std": null,
    "init_std_factor": "disabled",
    "max_seqlen": 8192,
    "attn_impl": "xformers",
    "attn_bias_type": "local_block_causal",
    "eos_id": 2,
    "seed": 42,
    "vocab_size": 260,
    "weight_tying": false,
    "sliding_window": 512
  },
  "train_entropy_model": true,
  "distributed": {
    "dp_shard": 1,
    "dp_replicate": 4,
    "tp_size": 1,
    "selective_activation_checkpointing": false,
    "compile": false,
    "fsdp_type": "full_shard",
    "model_dtype": "bf16",
    "float8_recipe": null,
    "float8_filter": "layers\\.[0-9]+\\.",
    "matmul_allow_tf32": false,
    "allow_bf16_reduced_precision_reduction": true,
    "detect_anomaly": false,
    "compile_cache_size_limit": 8,
    "spawn_method": "forkserver"
  },
  "env": {
    "MKL_SERVICE_FORCE_INTEL": "GNU",
    "OMP_NUM_THREADS": "1",
    "MKL_NUM_THREADS": "1",
    "ENABLE_INTRA_NODE_COMM": "1",
    "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",
    "NCCL_IB_TIMEOUT": "22",
    "NCCL_DEBUG": "INFO",
    "TORCH_NCCL_ASYNC_ERROR_HANDLING": "1"
  },
  "checkpoint": {
    "dump": {
      "every": 10000,
      "keep": 2
    },
    "eval": {
      "every": 5000,
      "keep": -1
    },
    "path": "/home/kieron/fyp/blt/tmp/blt-entropy/checkpoints",
    "init_ckpt_path": null,
    "continue_training_from_init": false,
    "s3_profile": null
  },
  "profiling": {
    "run": false,
    "trace_folder": "profiling",
    "mem_warmup": 100,
    "mem_steps": 2,
    "profile_warmup": 102,
    "profile_steps": 2
  },
  "logging": {
    "freq": 100,
    "acc_freq": null,
    "wandb": null
  },
  "async_eval_gpus": null,
  "eval": null,
  "eval_on_gpus": 4
}