| { | |
| "adaptive_mixing_args": null, | |
| "async_checkpointing": false, | |
| "async_eval_ngpus": -1, | |
| "batch_size": 2, | |
| "checkpoint_manifold_bucket": "genai_llm_fb", | |
| "data": "", | |
| "delete_manifold_checkpoints": true, | |
| "disable_logging": false, | |
| "disable_workers_print": false, | |
| "dist": { | |
| "global_rank": 0, | |
| "world_size": 8 | |
| }, | |
| "do_sync_eval": true, | |
| "dtype": "bf16", | |
| "dump_dir": "/tmp/metaformers_dmp", | |
| "dump_freq": 100, | |
| "dump_profile_traces": false, | |
| "enable_gil_watcher": false, | |
| "enable_loss_tracker": false, | |
| "eval": null, | |
| "eval_freq": 100, | |
| "exp_id": "", | |
| "exp_name": "", | |
| "finetuning_checkpoint_load_strict": false, | |
| "finetuning_dir": "/tmp/metaformers_dmp/checkpoints/stable/llama_cinnamon_7b", | |
| "fp32_reduce_scatter": true, | |
| "gpu_check_level": 3, | |
| "hive_data": null, | |
| "instruct": { | |
| "is_instruct_tuning": true, | |
| "no_loss_prompt": true, | |
| "no_loss_truncated": false, | |
| "only_sft_last_response": false, | |
| "smart_coalesce": false, | |
| "space_around_response": false, | |
| "wrap_seq_tokens_once": false | |
| }, | |
| "instruct_data": "/tmp/metaformers_dmp/data/sft/anthropic_prompts_open_13K_desc_1122_redist_6cat:0.25,/tmp/metaformers_dmp/data/sft/anthropic_prompts_open_13K_no_desc_1122_redist_6cat:0.25,/tmp/metaformers_dmp/data/sft/anthropic_responses_open_13K_desc_1122_redist_6cat:0.25,/tmp/metaformers_dmp/data/sft/anthropic_responses_open_13K_no_desc_1122_redist_6cat:0.25", | |
| "iter_batch_multi_hive_koski": null, | |
| "iter_jsonl": { | |
| "buffer_size": 64, | |
| "same_data": false | |
| }, | |
| "iter_multi": { | |
| "buffer_size": 512, | |
| "ignore_extra_chunks": true, | |
| "max_precompute": 20, | |
| "multiprocess": true | |
| }, | |
| "iter_type": "multi", | |
| "kd_args": { | |
| "kd_logits": false, | |
| "kd_model": null, | |
| "kd_model_dir": "", | |
| "reverse_kld_loss": false | |
| }, | |
| "keep_eval_checkpoints": true, | |
| "keep_n_last_checkpoints": 2, | |
| "log_all_steps": false, | |
| "log_freq": 1, | |
| "log_updates": true, | |
| "loss_rescaling": false, | |
| "manifold_output_dir": "tree/checkpoints/mast/inan/2023-11-27/080608_VAx9Hcb0THuGhWcZP4I6OA", | |
| "mixing_ratio": null, | |
| "model": { | |
| "alpha_depth": "disabled", | |
| "custom_bwd": true, | |
| "dim": 4096, | |
| "dim_by_layer": "", | |
| "dropout": 0, | |
| "efficient_attn": "cutlass", | |
| "ffn_dim": 512, | |
| "ffn_dim_multiplier": 1.0, | |
| "full_logging_n_layers": 4, | |
| "head_prune": false, | |
| "init": { | |
| "coeff_std": null, | |
| "depth_last": false, | |
| "fixed_std": null, | |
| "no_init": false, | |
| "use_depth": "current", | |
| "use_gaussian": true | |
| }, | |
| "init_on_meta_device": false, | |
| "layer_ckpt": "none", | |
| "loss_parallel": false, | |
| "max_length": 4096, | |
| "multiple_of": 256, | |
| "n_heads": 32, | |
| "n_heads_by_layer": "", | |
| "n_kv_heads": null, | |
| "n_kv_heads_by_layer": "", | |
| "n_layers": 32, | |
| "non_linearity": "swiglu", | |
| "norm_affine": true, | |
| "norm_eps": 1e-05, | |
| "norm_type": "rmsnorm", | |
| "output_size": -1, | |
| "parallel_impl": "FAIRSCALE", | |
| "position_interpolation": 1.0, | |
| "pre_norm": true, | |
| "recompute_attn": true, | |
| "recompute_fc1_out": true, | |
| "recompute_fc3_out": true, | |
| "rope_theta": 10000.0, | |
| "sequence_parallel": false, | |
| "use_rope": true, | |
| "use_xpos": false, | |
| "vocab_size": 32000, | |
| "xpos_gamma": 0.8, | |
| "xpos_scale_base": 4096, | |
| "xpos_theta": 500000.0 | |
| }, | |
| "model_parallel_size": 1, | |
| "no_final_ckpt": false, | |
| "optim": { | |
| "beta1": 0.9, | |
| "beta2": 0.95, | |
| "clip": 1.0, | |
| "cosine_theta": 1.0, | |
| "cycle_length": 1.0, | |
| "epsilon": 1e-08, | |
| "exp_factor": 0.5, | |
| "fused": null, | |
| "lr": 2e-06, | |
| "lr_min_ratio": 0.1, | |
| "scheduler": "cosine", | |
| "use_deprecated_optim": false, | |
| "use_sgd": false, | |
| "warmup": 100, | |
| "weight_decay": 0.1 | |
| }, | |
| "peft_args": null, | |
| "periodic_gpu_check": true, | |
| "profile_freq": -1, | |
| "reshard_after_forward": true, | |
| "restore_dataloader_position": false, | |
| "rlhf": null, | |
| "root_dump_dir": "/tmp/nobody/xldumps", | |
| "secondary_hive_data": null, | |
| "seq_len": 4096, | |
| "snapshot_restore_dir": null, | |
| "steps": 3000, | |
| "stuck_threshold_sec": 1500, | |
| "tb_upload_freq": 50, | |
| "tokenizer": "tokenizer_final_32k.minus_inf_ws.model", | |
| "tokenizer_dir": "/tmp/metaformers_dmp/tokenizer", | |
| "torch_seed": -1, | |
| "unlimited_steps": false, | |
| "valid": { | |
| "batch_size": 8, | |
| "content_key": null, | |
| "custom_preference_task_table1": "", | |
| "custom_preference_task_table2": "", | |
| "debug": false, | |
| "hive_data": null, | |
| "hive_tasks": [], | |
| "hive_tasks_output_hive_data": null, | |
| "instruct": { | |
| "is_instruct_tuning": true, | |
| "no_loss_prompt": true, | |
| "no_loss_truncated": false, | |
| "only_sft_last_response": false, | |
| "smart_coalesce": false, | |
| "space_around_response": false, | |
| "wrap_seq_tokens_once": false | |
| }, | |
| "iso_regression_model_path": "", | |
| "majority_voting": 0, | |
| "n_batches": 100, | |
| "n_generations": 1, | |
| "ppl_files_str": "", | |
| "ppl_root_dir": "", | |
| "prompt_path": "", | |
| "random_fewshots": false, | |
| "repetition_penalty": 1.0, | |
| "rlhf_eval": false, | |
| "seq_len": 2048, | |
| "task_batch_size": 8, | |
| "tasks_root_dir": "/tmp/metaformers_dmp/data/eval", | |
| "tasks_str": "safetyllama_prompt,safetyllama_response", | |
| "temperature": 1.0, | |
| "top_k": 0, | |
| "top_p": 0.0, | |
| "use_llm_inference": true, | |
| "use_relative_loss": true, | |
| "use_sampling": false, | |
| "write_eval": true, | |
| "write_every_n_batches": 1 | |
| } | |
| } |