name: gemma-7b-sql-nemo
trainer:
  num_nodes: 1
  devices: 8
  accelerator: gpu
  precision: bf16
  logger: false
  enable_checkpointing: false
  use_distributed_sampler: false
  max_time: null
  max_epochs: 1
  max_steps: -1
  sft:
    max_epochs: 1
    max_steps: -1
    val_check_interval: 1000
    save_interval: 1000
    limit_val_batches: 40
    gradient_clip_val: 1.0
exp_manager:
  explicit_log_dir: models/gemma-7b-sql-nemo
  exp_dir: null
  name: gemma-7b-sql-nemo
  create_wandb_logger: false
  wandb_logger_kwargs:
    project: null
    name: null
  resume_if_exists: true
  resume_ignore_no_checkpoint: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    monitor: validation_loss
    save_top_k: 5
    mode: min
    save_nemo_on_train_end: true
    filename: megatron_gpt_sft--{validation_loss:.3f}-{step}-{consumed_samples}-{epoch}
    model_parallel_size: 4
    save_best_model: false
model:
  seed: 1234
  tensor_model_parallel_size: 4
  pipeline_model_parallel_size: 1
  restore_from_path: /workspace/models/pytorch-7b-pt.nemo
  resume_from_checkpoint: null
  save_nemo_on_validation_end: true
  sync_batch_comm: false
  megatron_amp_O2: true
  encoder_seq_length: 8192
  sequence_parallel: false
  activations_checkpoint_granularity: null
  activations_checkpoint_method: null
  activations_checkpoint_num_layers: null
  activations_checkpoint_layers_per_pipeline: null
  answer_only_loss: true
  gradient_as_bucket_view: false
  seq_len_interpolation_factor: null
  use_flash_attention: null
  hidden_dropout: 0.0
  attention_dropout: 0.0
  ffn_dropout: 0.0
  peft:
    peft_scheme: none
    restore_from_path: null
    lora_tuning:
      target_modules:
      - attention_qkv
      adapter_dim: 32
      adapter_dropout: 0.0
      column_init_method: xavier
      row_init_method: zero
      layer_selection: null
      weight_tying: false
      position_embedding_strategy: null
  data:
    chat: false
    chat_prompt_tokens:
      system_turn_start: "\0"
      turn_start: "\x11"
      label_start: "\x12"
      end_of_turn: "\n"
      end_of_name: "\n"
    sample: false
    num_workers: 0
    dataloader_type: single
    train_ds:
      file_path: nsql.jsonl
      global_batch_size: 128
      micro_batch_size: 1
      shuffle: true
      memmap_workers: null
      max_seq_length: 8192
      min_seq_length: 1
      drop_last: true
      label_key: output
      add_eos: true
      add_sep: false
      add_bos: false
      truncation_field: input
      index_mapping_dir: null
      prompt_template: '{input} {output}'
      hf_dataset: false
      truncation_method: right
    validation_ds:
      file_path: nsql.jsonl
      global_batch_size: 128
      micro_batch_size: 1
      shuffle: false
      memmap_workers: null
      max_seq_length: 8192
      min_seq_length: 1
      drop_last: true
      label_key: output
      add_eos: true
      add_sep: false
      add_bos: false
      truncation_field: input
      index_mapping_dir: null
      prompt_template: '{input} {output}'
      hf_dataset: false
      truncation_method: right
      output_original_text: true
  optim:
    name: distributed_fused_adam
    lr: 5.0e-06
    weight_decay: 0.01
    betas:
    - 0.9
    - 0.98
    sched:
      name: CosineAnnealing
      warmup_steps: 10
      constant_steps: 1000
      min_lr: 9.0e-07
  bias_activation_fusion: true
  precision: bf16