| { | |
| "async_checkpointing": false, | |
| "async_eval_ngpus": -1, | |
| "batch_size": 2, | |
| "data": "", | |
| "disable_logging": false, | |
| "disable_workers_print": false, | |
| "dtype": "bf16", | |
| "dump_after_steps": 0, | |
| "dump_dir": "/fsx-onellm/rpasunuru/SFT/v2.1_textpp_7b_1366k_sftv1.4_exp1/v2.1_textpp_7b_1366k_sftv1.4_exp1_run000", | |
| "dump_freq": 400, | |
| "dump_profile_traces": false, | |
| "enable_loss_tracker": false, | |
| "epochs": -1, | |
| "eval_freq": 400, | |
| "exp_id": "", | |
| "exp_name": "", | |
| "finetuning_dir": "/fsx-onellm/shared/from_rsc/v2.1_7b_dr_qk_zloss_linear_zero3_sft_optiml_textpp_run000_checkpoint_1366000", | |
| "fp32_reduce_scatter": "all", | |
| "gpu_check_level": 3, | |
| "image_loss_weight": 1.0, | |
| "image_text_rotation_prob": 0.0, | |
| "instruct": { | |
| "no_loss_prompt": true, | |
| "no_loss_truncated": false, | |
| "use_eot": true | |
| }, | |
| "instruct_data": "/fsx-onellm/shared/finetuning/sft_v1.4_data/splits/train/long_caption:2.92,/fsx-onellm/shared/finetuning/sft_v1.4_data/splits/train/vqa:4.59,/fsx-onellm/shared/finetuning/sft_v1.4_data/splits/train/text2image:10.44,/fsx-onellm/shared/finetuning/sft_v1.4_data/splits/train/llama2_rjv6_helpful:43.27,/fsx-onellm/shared/finetuning/sft_v1.4_data/splits/train/code_llama:0.51,/fsx-onellm/shared/finetuning/sft_v1.4_data/splits/train/interleaved_batch1-17:27.45,/fsx-onellm/shared/finetuning/sft_v1.4_data/splits/train/image_dialogue:7.46,/fsx-onellm/shared/finetuning/sft_v1.4_data/splits/train/llama2_rjv6_harmless:0.97,/fsx-onellm/shared/finetuning/sft_v1.4_data/splits/train/cybersec_safety:0.33,/fsx-onellm/shared/finetuning/sft_v1.4_data/splits/train/onellm_multimodal_safety:0.86,/fsx-onellm/shared/finetuning/sft_v1.4_data/splits/train/autosafety:0.51,/fsx-onellm/shared/finetuning/sft_v1.4_data/splits/train/rainbow_safety:0.10,/fsx-onellm/shared/finetuning/sft_v1.4_data/splits/train/genai_safety:0.58", | |
| "iter_gopher": { | |
| "buffer_size": 16, | |
| "max_precompute": 10, | |
| "n_chars_by_tok": 15, | |
| "n_seqs_to_concat": 10, | |
| "num_processes": 1 | |
| }, | |
| "iter_jsonl": { | |
| "buffer_size": 64, | |
| "same_data": false | |
| }, | |
| "iter_multi": { | |
| "buffer_size": 512, | |
| "ignore_extra_chunks": true, | |
| "max_precompute": 20, | |
| "multiprocess": true | |
| }, | |
| "iter_type": "multi", | |
| "keep_checkpoints_every_steps": 400, | |
| "keep_eval_checkpoints": true, | |
| "keep_n_last_checkpoints": 2, | |
| "log_all_steps": false, | |
| "log_freq": 10, | |
| "log_updates": true, | |
| "log_wandb": false, | |
| "loss_rescaling": false, | |
| "model": { | |
| "add_extra_toks": "0", | |
| "alpha_depth": "disabled", | |
| "attn_dropout": 0, | |
| "attn_to_keep": "all", | |
| "custom_bwd": false, | |
| "dim": 4096, | |
| "dropout": 0.05, | |
| "efficient_attn": "flash", | |
| "emb_dropout": 0, | |
| "ffn_dim_multiplier": 1.0, | |
| "ffn_dropout": 0, | |
| "full_logging_n_layers": 4, | |
| "fuse_sequence_parallel": false, | |
| "init": { | |
| "coeff_std": null, | |
| "depth_last": false, | |
| "fixed_std": null, | |
| "no_init": false, | |
| "pos_init_scalar": null, | |
| "use_depth": "current", | |
| "use_gaussian": true | |
| }, | |
| "layer_ckpt": "none", | |
| "linear_residual_dropout": false, | |
| "loss_parallel": false, | |
| "max_length": 2048, | |
| "multiple_of": 256, | |
| "n_heads": 32, | |
| "n_kv_heads": null, | |
| "n_layers": 32, | |
| "non_linearity": "swiglu", | |
| "norm_affine": true, | |
| "norm_eps": 1e-05, | |
| "norm_type": "rmsnorm", | |
| "output_dropout": 0, | |
| "output_size": -1, | |
| "pre_norm": true, | |
| "qk_normalization": true, | |
| "recompute_attn": true, | |
| "recompute_fc1_out": true, | |
| "recompute_fc3_out": true, | |
| "residual_dropout": 0.0, | |
| "rope_theta": 10000.0, | |
| "sequence_parallel": false, | |
| "swin_norm": false, | |
| "turn_eos_token": "<eos>", | |
| "use_rope": true, | |
| "vocab_size": 65536 | |
| }, | |
| "model_parallel_size": 1, | |
| "no_final_ckpt": false, | |
| "num_retrieved_docs": 0, | |
| "old_mp": -1, | |
| "old_world_size": -1, | |
| "optim": { | |
| "beta1": 0.9, | |
| "beta2": 0.95, | |
| "clip": 1.0, | |
| "cosine_theta": 1.0, | |
| "cycle_length": 1.0, | |
| "epsilon": 1e-08, | |
| "exp_factor": 0.5, | |
| "lr": 1e-05, | |
| "lr_min_ratio": 0.1, | |
| "scheduler": "cosine", | |
| "use_deprecated_optim": false, | |
| "warmup": 100, | |
| "weight_decay": 0.1 | |
| }, | |
| "periodic_gpu_check": true, | |
| "profile_freq": -1, | |
| "reshard_after_forward": false, | |
| "restore_dataloader_position": false, | |
| "retrieval_prob": 0.0, | |
| "rlhf": null, | |
| "root_dump_dir": "", | |
| "save_optimizer_states": true, | |
| "seq_len": 4096, | |
| "slurm": { | |
| "global_rank": 0, | |
| "is_slurm_job": true, | |
| "world_size": 64 | |
| }, | |
| "steps": 1200, | |
| "tokenizer": "/fsx-onellm/rpasunuru/models/cm3z/cm3v2_7b_placeholder/gpt2-unified-image-sentinel.json", | |
| "tokenizer_dir": "/fsx/guismay/data/large_experiments/fair_llm/datasets/tokenizers", | |
| "torch_seed": -1, | |
| "unlimited_steps": false, | |
| "use_hf_tokenizer": true, | |
| "valid": { | |
| "batch_size": 32, | |
| "debug": false, | |
| "majority_voting": 0, | |
| "n_batches": 100, | |
| "onellm_eval": false, | |
| "onellm_eval_media_storage": "", | |
| "ppl_files_str": "", | |
| "prompt_path": "", | |
| "prompt_templates": "{}", | |
| "random_fewshots": false, | |
| "seq_len": 2048, | |
| "tasks_root_dir": "", | |
| "tasks_str": "", | |
| "temperature": 1.0, | |
| "top_k": 0, | |
| "top_p": 0.0, | |
| "use_sampling": false, | |
| "write_eval": false | |
| }, | |
| "wandb_entity": "violet-zct", | |
| "wandb_project": "instruct_sft", | |
| "water_marking_codes_str": null, | |
| "z_loss_weight": 0.0001 | |
| } |