# config/model.yaml
dataset:
  name: "mohsin416/Python-Alpaca-5k"
  shuffle_seed: 42
  SYSTEM_PROMPT: "<|system|>\nYou are a senior Python developer. Provide clear, correct, well-commented code.<|end|>\n\n"
  USER_TOKEN: "<|user|>\n"
  ASSISTANT_TOKEN: "<|assistant|>\n"
  END_TOKEN: "<|end|>"
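  # The tokens above assemble each example into the Phi-3 chat template.
  # A minimal formatting sketch, assuming the dataset exposes Alpaca-style
  # "instruction" and "output" columns (format_example is a hypothetical
  # helper, not part of this config's schema):
  #
  #   def format_example(row, ds_cfg):
  #       return (ds_cfg["SYSTEM_PROMPT"]
  #               + ds_cfg["USER_TOKEN"] + row["instruction"] + ds_cfg["END_TOKEN"] + "\n"
  #               + ds_cfg["ASSISTANT_TOKEN"] + row["output"] + ds_cfg["END_TOKEN"])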
model:
  base_model_id: "microsoft/Phi-3-mini-128k-instruct"
  attn_implementation: "flash_attention_2"
  quantization:
    load_in_4bit: true
    bnb_4bit_quant_type: "nf4"
    bnb_4bit_compute_dtype: "bfloat16"
    bnb_4bit_use_double_quant: true
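  # The quantization block maps one-to-one onto transformers.BitsAndBytesConfig;
  # note the "bfloat16" string must be resolved to torch.bfloat16 at load time.
  # A minimal loading sketch under that assumption:
  #
  #   import torch
  #   from transformers import AutoModelForCausalLM, BitsAndBytesConfig
  #
  #   bnb = BitsAndBytesConfig(
  #       load_in_4bit=True,
  #       bnb_4bit_quant_type="nf4",
  #       bnb_4bit_compute_dtype=torch.bfloat16,
  #       bnb_4bit_use_double_quant=True,
  #   )
  #   model = AutoModelForCausalLM.from_pretrained(
  #       "microsoft/Phi-3-mini-128k-instruct",
  #       quantization_config=bnb,
  #       attn_implementation="flash_attention_2",
  #   )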
lora:
  r: 32
  lora_alpha: 32
  lora_dropout: 0.1
  bias: "none"
  task_type: "CAUSAL_LM"
  use_rslora: true
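  # These keys follow peft.LoraConfig. With use_rslora the adapter scaling is
  # lora_alpha / sqrt(r) rather than lora_alpha / r. target_modules is not set
  # here; recent PEFT versions fall back to per-architecture defaults when it
  # is omitted. A minimal sketch:
  #
  #   from peft import LoraConfig, get_peft_model
  #
  #   lora_cfg = LoraConfig(r=32, lora_alpha=32, lora_dropout=0.1, bias="none",
  #                         task_type="CAUSAL_LM", use_rslora=True)
  #   model = get_peft_model(model, lora_cfg)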
paths:
  output_dir: "artifacts/outputs"
  adapter_save_dir: "artifacts/phi3-python-instruct-adapter"
  final_model_repo: "mohsin416/phi3-python-instruct"
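  # After training, the adapter would typically be written to adapter_save_dir
  # and a merged model pushed to final_model_repo. Hypothetical post-training
  # calls, not part of this config's schema:
  #
  #   trainer.model.save_pretrained("artifacts/phi3-python-instruct-adapter")
  #   merged = trainer.model.merge_and_unload()
  #   merged.push_to_hub("mohsin416/phi3-python-instruct")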
training:
  per_device_train_batch_size: 4
  per_device_eval_batch_size: 4
  gradient_accumulation_steps: 8
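  # Effective batch size: 4 per device x 8 accumulation steps = 32 sequences
  # per optimizer update (per GPU).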
  num_train_epochs: 2
  learning_rate: 2.0e-5
  warmup_ratio: 0.1
  warmup_steps: 0
  bf16: true
  tf32: false
  fp16: false
  lr_scheduler_type: "cosine"
  optim: "paged_adamw_8bit"
  gradient_checkpointing: true
  gradient_checkpointing_kwargs: {"use_reentrant": false}
  max_grad_norm: 1.0
  weight_decay: 0.01
  logging_steps: 50
  eval_steps: 50
  save_steps: 50
  eval_strategy: "steps"
  save_strategy: "steps"
  save_total_limit: 3
  load_best_model_at_end: true
  metric_for_best_model: "eval_loss"
  greater_is_better: false
  prediction_loss_only: true
  report_to: "wandb"
  dataloader_num_workers: 4
  dataloader_pin_memory: true
  max_seq_length: 4096
  dataset_text_field: "text"
  label_names: ["labels"]
  neftune_noise_alpha: 5
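  # The training keys follow trl's SFTConfig (a TrainingArguments subclass that
  # also accepts max_seq_length, dataset_text_field, and neftune_noise_alpha).
  # A minimal wiring sketch, assuming `training` holds this section as a dict
  # and model/datasets come from the sketches above:
  #
  #   from trl import SFTConfig, SFTTrainer
  #
  #   args = SFTConfig(output_dir="artifacts/outputs", **training)
  #   trainer = SFTTrainer(model=model, args=args,
  #                        train_dataset=train_ds, eval_dataset=eval_ds)
  #   trainer.train()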
wandb:
  project_name: "Phi-3-mini-128k-instruct-metrics"
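  # report_to: "wandb" enables logging, but TrainingArguments has no project
  # field; the project name is usually applied via the WANDB_PROJECT
  # environment variable or wandb.init(project=...) before training starts.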