jrc
/

llama3-8b-coedit

+# Config for multi-device LoRA finetuning in lora_finetune_distributed.py
+# using a Llama3 8B model
+#
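+# This config assumes that you've run the following command before launching
+# this run, so that the tokenizer and checkpoint end up under ./model/original/
+# (the relative paths used by the tokenizer and checkpointer sections below):
+#   tune download meta-llama/Meta-Llama-3-8B --output-dir ./model --hf-token <HF_TOKEN>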
+#
+# To launch on 2 devices, run the following command from root:
+#   tune run --nproc_per_node 2 lora_finetune_distributed --config llama3/8B_lora
+#
+# You can add specific overrides through the command line. For example
+# to override the checkpointer directory while launching training
+# you can run:
+#   tune run --nproc_per_node 2 lora_finetune_distributed --config llama3/8B_lora checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
+#
+# This config works best when the model is being fine-tuned on 2+ GPUs.
+# For single device LoRA finetuning please use 8B_lora_single_device.yaml
+# or 8B_qlora_single_device.yaml
+# Tokenizer
+# Instantiates the llama3_tokenizer component from the tokenizer.model file
+# below. The path is relative, so it presumably resolves against the working
+# directory the run is launched from — TODO confirm.
+tokenizer:
+  _component_: torchtune.models.llama3.llama3_tokenizer
+  path: ./model/original/tokenizer.model
+# Model Arguments
+# LoRA variant of the Llama3 8B builder. LoRA is requested for the q_proj and
+# v_proj attention modules only; it is switched off for the MLP and for the
+# output projection. Rank and alpha are set to 8 and 16 respectively.
+model:
+  _component_: torchtune.models.llama3.lora_llama3_8b
+  lora_attn_modules:
+    - q_proj
+    - v_proj
+  apply_lora_to_mlp: false
+  apply_lora_to_output: false
+  lora_rank: 8
+  lora_alpha: 16
+# Checkpointing
+# checkpoint_dir / checkpoint_files name the single consolidated weights file
+# to load; output_dir is the directory handed to the checkpointer for output.
+# No recipe checkpoint is supplied (null).
+checkpointer:
+  _component_: torchtune.utils.FullModelMetaCheckpointer
+  checkpoint_dir: ./model/original/
+  checkpoint_files:
+    - consolidated.00.pth
+  recipe_checkpoint: null
+  output_dir: ./finetuned_model/
+  model_type: LLAMA3
+# Not resuming: consistent with recipe_checkpoint being null above.
+resume_from_checkpoint: false
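+# Dataset and Sampler
+# Reference sketch of the InstructDataset call this section corresponds to.
+# Note that the column_map in the sketch is not the one used here: the dataset
+# section below maps "sentence" to "src" and "output" to "tgt".
+# InstructDataset(
+#        tokenizer=tokenizer,
+#        source=source,
+#        template=GrammarErrorCorrectionTemplate,
+#        column_map={"sentence": "input"},
+#        train_on_input=train_on_input,
+#        split="train",
+#    )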
+dataset:
+  _component_: torchtune.datasets.instruct_dataset
+  source: grammarly/coedit
+  template: GrammarErrorCorrectionTemplate
+  # Presumably maps the template's "sentence" / "output" fields onto the
+  # source dataset's "src" / "tgt" columns — verify against the dataset schema.
+  column_map:
+    sentence: src
+    output: tgt
+  train_on_input: false
+  split: train
+# Sampler settings: fixed seed, shuffling enabled, 4 samples per batch.
+seed: 123
+shuffle: true
+batch_size: 4
+# Optimizer and Scheduler
+# AdamW with weight decay 0.01, a cosine schedule with 100 warmup steps, and
+# cross-entropy loss.
+optimizer:
+  _component_: torch.optim.AdamW
+  weight_decay: 0.01
+  # Written with an explicit fraction part: a bare "3e-4" has no dot and is
+  # resolved as a string (not a float) by YAML 1.1 loaders such as stock
+  # PyYAML. "3.0e-4" is a float under both YAML 1.1 and 1.2 resolvers.
+  lr: 3.0e-4
+lr_scheduler:
+  _component_: torchtune.modules.get_cosine_schedule_with_warmup
+  num_warmup_steps: 100
+loss:
+  _component_: torch.nn.CrossEntropyLoss
+# Training
+# Two epochs with no per-epoch step cap (null). Gradients are accumulated over
+# 32 steps; with batch_size: 4 that is 4 * 32 = 128 samples per optimizer
+# update on each device — TODO confirm how the recipe aggregates across devices.
+epochs: 2
+max_steps_per_epoch: null
+gradient_accumulation_steps: 32
+# Logging
+# Metrics go to the WandBLogger component under project "torchtune", group
+# "llama3-grammarly". This top-level output_dir is separate from
+# checkpointer.output_dir.
+output_dir: ./lora_finetune_output
+metric_logger:
+  _component_: torchtune.utils.metric_logging.WandBLogger
+  project: torchtune
+  group: llama3-grammarly
+# NOTE(review): null leaves the logging interval up to the recipe; confirm the
+# recipe version in use falls back to a default rather than failing on null.
+log_every_n_steps: null
+# Environment
+# Run on CUDA devices in bf16, with activation checkpointing turned off.
+device: cuda
+dtype: bf16
+enable_activation_checkpointing: false