| {"name":"debug","dump_dir":"/home/kieron/fyp/blt/tmp/blt-entropy","seed":777,"debug_dynamo":false,"grad_acc_steps":1,"gc_collect_freq":1000,"probe_freq":null,"steps":100000,"max_steps":null,"data":{"s3_profile":null,"root_dir":"/home/kieron/fyp/data/mc4_SEA_1000000_sentences","sources":{"combined":1.0},"batch_size":16,"seq_len":8192,"seed":42,"add_bos":true,"add_eos":true,"load_async":true,"async_persist_type":"exact","prefetch_size":64,"preprocess_dir":null,"dataset_files":null,"entropy_model_name":"transformer_100m","arrow_batch_size":20,"buffer_size":64,"file_format":"json","pad_to_max_length":true,"max_encoder_seq_length":8192,"enable_byte_ngrams":false,"add_patches":false,"tokenizer_args":{"name":"blt","init_kwargs":null},"patcher_args":{"patching_mode":"byte","patching_device":"cuda","entropy_model_checkpoint_dir":null,"realtime_patching":false,"threshold":1.335442066192627,"threshold_add":null,"max_patch_length":null,"patch_size":4.5,"patching_batch_size":1,"device":"cuda","monotonicity":false,"log_time":false}},"optim":{"lr":0.0004,"weight_decay":0.1,"epsilon":1e-8,"beta1":0.9,"beta2":0.95,"clip":10.0,"scheduler":"cosine","warmup":500,"lr_min_ratio":0.1,"cycle_length":1.0,"cosine_theta":1.0,"annealing_step":1000,"decay_fraction":0.1,"exp_factor":0.5},"model":null,"entropy_model":{"dim":512,"n_layers":14,"head_dim":null,"n_heads":8,"n_kv_heads":null,"ffn_dim_multiplier":1.0,"multiple_of":256,"norm_eps":0.00001,"rope_theta":10000.0,"rope_use_fp32_in_outer_product":false,"init_base_std":null,"init_std_factor":"disabled","max_seqlen":8192,"attn_impl":"xformers","attn_bias_type":"local_block_causal","eos_id":2,"seed":42,"vocab_size":260,"weight_tying":false,"sliding_window":512},"train_entropy_model":true,"distributed":{"dp_shard":1,"dp_replicate":4,"tp_size":1,"selective_activation_checkpointing":false,"compile":false,"fsdp_type":"full_shard","model_dtype":"bf16","float8_recipe":null,"float8_filter":"layers\\.[0-9]+\\.","matmul_allow_tf32":false,"allow_bf16_reduced_precision_reduction":true,"detect_anomaly":false,"compile_cache_size_limit":8,"spawn_method":"forkserver"},"env":{"MKL_SERVICE_FORCE_INTEL":"GNU","OMP_NUM_THREADS":"1","MKL_NUM_THREADS":"1","ENABLE_INTRA_NODE_COMM":"1","TORCH_NCCL_AVOID_RECORD_STREAMS":"1","NCCL_IB_TIMEOUT":"22","NCCL_DEBUG":"INFO","TORCH_NCCL_ASYNC_ERROR_HANDLING":"1"},"checkpoint":{"dump":{"every":10000,"keep":2},"eval":{"every":5000,"keep":-1},"path":"/home/kieron/fyp/blt/tmp/blt-entropy/checkpoints","init_ckpt_path":null,"continue_training_from_init":false,"s3_profile":null},"profiling":{"run":false,"trace_folder":"profiling","mem_warmup":100,"mem_steps":2,"profile_warmup":102,"profile_steps":2},"logging":{"freq":100,"acc_freq":null,"wandb":null},"async_eval_gpus":null,"eval":null,"eval_on_gpus":4} |