---
library_name: transformers
tags:
- axolotl
- generated_from_trainer
datasets:
- train_v2.1.jsonl
model-index:
- name: llama33_fix3
  results: []
---

<!-- This model card has been generated automatically according to the information the Trainer had access to. You
should probably proofread and complete it, then remove this comment. -->

[<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
<details><summary>See axolotl config</summary>

axolotl version: `0.8.1`
```yaml
# This is the huggingface model that contains *.pt, *.safetensors, or *.bin files
# This can also be a relative path to a model on disk
base_model: ./step1-embed-model/merged/

# Use CUDA bf16
bf16: auto
tf32: false

# List[str]. Add plugins to extend the pipeline.
# See `src/axolotl/integrations` for the available plugins or doc below for more details.
# https://axolotl-ai-cloud.github.io/axolotl/docs/custom_integrations.html
plugins:
  - axolotl.integrations.liger.LigerPlugin
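# The liger_* flags below swap in Liger's fused Triton kernels for RoPE, RMSNorm,
# the GLU activation, LayerNorm, and the fused linear cross-entropy loss.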
liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_layer_norm: true
liger_fused_linear_cross_entropy: true

# A list of one or more datasets to finetune the model with
datasets:
  # HuggingFace dataset repo | s3://,gs:// path | "json" for local dataset, make sure to fill data_files
  - path: train_v2.1.jsonl
    # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]
    type: input_output # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
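    # With the input_output format, each line of the JSONL is expected to hold a
    # "segments" list of {"label": bool, "text": str} entries; spans with label: false
    # are masked out of the loss (per Axolotl's input_output prompt strategy).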

# If false, the datasets will not be shuffled and will keep their original order in `datasets`.
# The same applies to the `test_datasets` option and the `pretraining_dataset` option. Default is true.
shuffle_merged_datasets: false

# Axolotl attempts to save the dataset as an arrow file after packing the data together so
# subsequent training attempts load faster, relative path
dataset_prepared_path: data/llama33fix_prepared
# The maximum number of processes to use while preprocessing your input dataset. This defaults to `os.cpu_count()`
# if not set.
dataset_processes: 1
# Push checkpoints to hub
hub_model_id: AlexHung29629/llama33_fix3
hub_strategy: end
# Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets
# Required to be true when used in combination with `push_dataset_to_hub`
hf_use_auth_token: true
# How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for no eval.
val_set_size: 0

# The maximum length of an input to train with; this should typically be less than 2048
# as most models have a token/context limit of 2048
sequence_len: 2048
# Pad inputs so each step uses constant-sized buffers
# This will reduce memory fragmentation and may prevent OOMs by re-using memory more efficiently
pad_to_sequence_len: true
# Use efficient multi-packing with block-diagonal attention and per-sequence position_ids. Recommended to set to 'true'
sample_packing: false

# wandb configuration if you're using it
# Make sure your `WANDB_API_KEY` environment variable is set (recommended) or you log in to wandb with `wandb login`.
wandb_mode: disabled

# Tensorboard
use_tensorboard: true

# Where to save the full-finetuned model to
output_dir: ./step2-model


# Training hyperparameters

# If greater than 1, backpropagation will be skipped and the gradients will be accumulated for the given number of steps.
gradient_accumulation_steps: 1
# The number of samples to include in each batch. This is the number of samples sent to each GPU.
# Batch size per GPU = micro_batch_size * gradient_accumulation_steps
micro_batch_size: 1
num_epochs: 4
warmup_ratio: 0.0 # cannot use with warmup_steps
learning_rate: 2e-6
logging_steps: 1
save_strategy: epoch
#save_steps: 1000
saves_per_epoch: 1

# Whether to use gradient checkpointing. Available options are: true, false, "offload".
# https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
gradient_checkpointing: true
# Additional kwargs to pass to the trainer for gradient checkpointing
#gradient_checkpointing_kwargs:
#  use_reentrant: true

# Specify a scheduler and kwargs to use with the optimizer
lr_scheduler: cosine

optimizer: adamw_torch

# Specify weight decay
weight_decay: 0
# AdamW hyperparams
adam_beta1: 0.9
adam_beta2: 0.95
adam_epsilon: 1e-8
# Gradient clipping max norm
max_grad_norm: 1

# Optional[bool]. Whether to use the flash attention patch https://github.com/Dao-AILab/flash-attention
flash_attention: true

fsdp_final_state_dict_type: SHARDED_STATE_DICT
fsdp:
  - full_shard
  - auto_wrap
fsdp_config:
  fsdp_limit_all_gathers: true
  fsdp_sync_module_states: true
  fsdp_offload_params: true
  fsdp_use_orig_params: false
  fsdp_cpu_ram_efficient_loading: true
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_sharding_strategy: FULL_SHARD
  fsdp_backward_prefetch: BACKWARD_PRE
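
# <|finetune_right_pad_id|> and <|eot_id|> are reserved Llama 3.x tokenizer tokens
# (a dedicated padding token and the end-of-turn marker, respectively).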
special_tokens:
  pad_token: <|finetune_right_pad_id|>
  eos_token: <|eot_id|>

# Deepspeed config path, e.g. deepspeed_configs/zero3.json
#deepspeed: /mnt/shared/twsc/alex/reasoning/zero3_bf16.json

# Seed
seed: 42
save_only_model: true
```

</details><br>

# llama33_fix3

This model was fine-tuned from `./step1-embed-model/merged/` (the `base_model` in the config above) on the train_v2.1.jsonl dataset.
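
For a quick smoke test, the checkpoint can be loaded with the standard `transformers` API. This is a minimal sketch: it assumes the weights and tokenizer were pushed under the `hub_model_id` from the config above (`AlexHung29629/llama33_fix3`) and that `accelerate` is installed for `device_map="auto"`.

```python
# Minimal loading sketch; the model id is the hub_model_id from the axolotl config above.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "AlexHung29629/llama33_fix3"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,  # training used bf16
    device_map="auto",           # requires accelerate
)

prompt = "Hello!"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```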

## Model description

More information needed

## Intended uses & limitations

More information needed

## Training and evaluation data

More information needed

## Training procedure

### Training hyperparameters

The following hyperparameters were used during training:
- learning_rate: 2e-06
- train_batch_size: 1
- eval_batch_size: 1
- seed: 42
- distributed_type: multi-GPU
- num_devices: 8
- total_train_batch_size: 8
- total_eval_batch_size: 8
- optimizer: AdamW (adamw_torch) with betas=(0.9, 0.95), epsilon=1e-08, and no additional optimizer arguments
- lr_scheduler_type: cosine
- num_epochs: 4.0
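
The total batch sizes above follow from micro_batch_size × gradient_accumulation_steps × num_devices = 1 × 1 × 8 = 8.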

### Training results



### Framework versions

- Transformers 4.51.0
- Pytorch 2.6.0+cu124
- Datasets 3.5.0
- Tokenizers 0.21.1