End of training

Browse files
- README.md +68 -61
- pytorch_model.bin +3 -0
README.md CHANGED

@@ -6,7 +6,7 @@ tags:
 - axolotl
 - generated_from_trainer
 datasets:
--
+- tatsu-lab/alpaca
 model-index:
 - name: Alpaca-Llama-3.2-1B-Instruct
   results: []

@@ -20,71 +20,76 @@ should probably proofread and complete it, then remove this comment. -->
 
 axolotl version: `0.6.0`
 ```yaml
-base_model: meta-llama/Llama-3.2-1B
-model_type: LlamaForCausalLM
-tokenizer_type: PreTrainedTokenizerFast
-
-strict: false
-
-save_safetensors: true
-flash_attention: true
-
-auto_resume_from_checkpoints: true
-save_steps: 100
-
-learning_rate: 5e-4
-num_epochs: 3
-micro_batch_size: 8
-gradient_accumulation_steps: 4
-optimizer: adamw_bnb_8bit
-lr_scheduler: cosine
+
+base_model: meta-llama/Llama-3.2-1B
 hub_model_id: minpeter/Alpaca-Llama-3.2-1B-Instruct
 
-chat_template_jinja: |-
-  {%- for message in messages %}
-  {%- if message['role'] in ['user', 'assistant'] %}
-  {{- '<|' + message['role'] + '|>\n' }}
-  {{- message['content'] + '\n' }}
-  {%- else %}
-  {{- raise_exception('Invalid role: ' + message['role']) }}
-  {%- endif %}
-  {%- endfor %}
-  {%- if add_generation_prompt %}
-  {{- '<|assistant|>\n' }}
-  {%- endif %}
+load_in_8bit: false
+load_in_4bit: false
+strict: false
 
 datasets:
-  - path:
-    type:
-
-
-
-      <|user|>
-      {instruction} {input}
-      <|assistant|>
-
-    no_input_format: |
-      <|user|>
-      {instruction}
-      <|assistant|>
-
+  - path: tatsu-lab/alpaca
+    type: alpaca
+dataset_prepared_path: last_run_prepared
+dataset_processes: 1000
+val_set_size: 0.05
+output_dir: ./outputs/out
+
+sequence_len: 8192
+sample_packing: true
+pad_to_sequence_len: true
 
 wandb_project: "axolotl"
 wandb_entity: "kasfiekfs-e"
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 8
+micro_batch_size: 1
+num_epochs: 1
+optimizer: paged_adamw_8bit
+lr_scheduler: cosine
+learning_rate: 2e-5
+
+train_on_inputs: false
+group_by_length: false
+bf16: auto
+fp16:
+tf32: false
+
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+early_stopping_patience:
+resume_from_checkpoint:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+
+warmup_steps: 100
+evals_per_epoch: 2
+eval_table_size:
+saves_per_epoch: 1
+debug:
+deepspeed:
+weight_decay: 0.0
+fsdp:
+fsdp_config:
+special_tokens:
+  pad_token: <|end_of_text|>
+
 ```
 
 </details><br>
 
 # Alpaca-Llama-3.2-1B-Instruct
 
-This model is a fine-tuned version of [meta-llama/Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B) on the
+This model is a fine-tuned version of [meta-llama/Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B) on the tatsu-lab/alpaca dataset.
+It achieves the following results on the evaluation set:
+- Loss: 1.3881
 
 ## Model description

@@ -103,22 +108,24 @@ More information needed
 ### Training hyperparameters
 
 The following hyperparameters were used during training:
-- learning_rate:
-- train_batch_size:
-- eval_batch_size:
+- learning_rate: 2e-05
+- train_batch_size: 1
+- eval_batch_size: 1
 - seed: 42
--
--
--
-- total_train_batch_size: 256
-- total_eval_batch_size: 64
-- optimizer: Use OptimizerNames.ADAMW_BNB with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
+- gradient_accumulation_steps: 8
+- total_train_batch_size: 8
+- optimizer: Use OptimizerNames.PAGED_ADAMW_8BIT with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
 - lr_scheduler_type: cosine
-- lr_scheduler_warmup_steps:
-- num_epochs:
+- lr_scheduler_warmup_steps: 100
+- num_epochs: 1
 
 ### Training results
 
+| Training Loss | Epoch  | Step | Validation Loss |
+|:-------------:|:------:|:----:|:---------------:|
+| 1.5628        | 0.0127 | 1    | 1.5941          |
+| 1.4085        | 0.4960 | 39   | 1.4333          |
+| 1.3727        | 0.9921 | 78   | 1.3881          |
 
 ### Framework versions
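The earlier revision of the config defined a `chat_template_jinja` (removed in this commit) that targeted a plain `<|user|>` / `<|assistant|>` turn format. For reference, here is a minimal sketch, assuming only the `jinja2` package, of how that template renders; the `raise_exception` helper is normally injected by the tokenizer's chat-template machinery and is stubbed in here. This is an illustration, not part of the commit.

```python
# Render the removed chat_template_jinja outside of transformers.
from jinja2 import Environment

CHAT_TEMPLATE = r"""{%- for message in messages %}
{%- if message['role'] in ['user', 'assistant'] %}
{{- '<|' + message['role'] + '|>\n' }}
{{- message['content'] + '\n' }}
{%- else %}
{{- raise_exception('Invalid role: ' + message['role']) }}
{%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
{{- '<|assistant|>\n' }}
{%- endif %}
"""

def raise_exception(msg):
    # Stand-in for the helper transformers exposes to chat templates.
    raise ValueError(msg)

env = Environment()
env.globals["raise_exception"] = raise_exception
template = env.from_string(CHAT_TEMPLATE)

prompt = template.render(
    messages=[{"role": "user", "content": "Name three primary colors."}],
    add_generation_prompt=True,
)
print(prompt)
# <|user|>
# Name three primary colors.
# <|assistant|>
```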
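The new config trains on tatsu-lab/alpaca with `micro_batch_size: 1` and `gradient_accumulation_steps: 8`, which is where the reported `total_train_batch_size` of 8 comes from (1 × 8 on a single device). Below is a hedged usage sketch for the uploaded checkpoint; the Alpaca-style prompt wording is an assumption based on axolotl's `alpaca` dataset type, not something the card itself states.

```python
# Load the published checkpoint and prompt it in a classic Alpaca format
# (prompt wording is an assumption, see the note above).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "minpeter/Alpaca-Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)

prompt = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\nName three primary colors.\n\n### Response:\n"
)
inputs = tokenizer(prompt, return_tensors="pt")
output = model.generate(**inputs, max_new_tokens=64)
# Print only the newly generated tokens.
print(tokenizer.decode(output[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))
```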
pytorch_model.bin ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:547a2ac73e4b7254f8b1ce78c65d9fd6bf777565ee77965b4fb5ff67e56ba14e
+size 2471678226
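The pytorch_model.bin entry above is a git-lfs pointer rather than the weights themselves: the `oid` line records the SHA-256 of the real ~2.47 GB file. A small sketch, assuming the `huggingface_hub` package and network access, that downloads the file and checks it against that digest:

```python
# Resolve the LFS pointer to the actual weight file and verify its SHA-256.
import hashlib
from huggingface_hub import hf_hub_download

path = hf_hub_download(
    repo_id="minpeter/Alpaca-Llama-3.2-1B-Instruct",
    filename="pytorch_model.bin",
)

sha256 = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha256.update(chunk)

# Should match the oid recorded in the pointer file:
# 547a2ac73e4b7254f8b1ce78c65d9fd6bf777565ee77965b4fb5ff67e56ba14e
print(sha256.hexdigest())
```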