Add training script for SmolLM2-135M model using Unsloth. Includes model loading, dataset preparation, and training configuration. Provides detailed instructions for setup and execution.
train.py ADDED
@@ -0,0 +1,174 @@
#!/usr/bin/env python3
"""
Fine-tuning script for the SmolLM2-135M model using Unsloth.

This script demonstrates how to:
1. Install and configure Unsloth
2. Prepare and format the training data
3. Configure and run the training process
4. Save the trained model

To run this script:
1. Install dependencies: pip install -r requirements.txt
2. Run: python train.py
"""

import os  # used only by the optional install_dependencies helper below

from typing import Union

from datasets import (
    Dataset,
    DatasetDict,
    IterableDataset,
    IterableDatasetDict,
    load_dataset,
)
from transformers import AutoTokenizer, Trainer, TrainingArguments
from trl import SFTTrainer
from unsloth import FastLanguageModel, is_bfloat16_supported
from unsloth.chat_templates import get_chat_template

# Configuration
max_seq_length = 2048  # Unsloth supports RoPE scaling internally
dtype = None  # None for auto-detection; float16 for Tesla T4/V100, bfloat16 for Ampere+
load_in_4bit = True  # Use 4-bit quantization to reduce memory usage

# def install_dependencies():
#     """Install required dependencies."""
#     os.system('pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"')
#     os.system('pip install --no-deps xformers trl peft accelerate bitsandbytes')
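
# Note: the docstring's install step references a requirements.txt that is not
# part of this commit. Based on the pip commands above, a plausible version
# (an assumption, not the actual file) would be:
#
#   unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git
#   xformers
#   trl
#   peft
#   accelerate
#   bitsandbytes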


def load_model() -> tuple[FastLanguageModel, AutoTokenizer]:
    """Load and configure the model."""
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="unsloth/SmolLM2-135M-Instruct-bnb-4bit",
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )

    # Configure LoRA
    model = FastLanguageModel.get_peft_model(
        model,
        r=64,
        target_modules=[
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
        ],
        lora_alpha=128,
        lora_dropout=0.05,
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=3407,
        use_rslora=True,
        loftq_config=None,
    )
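    # Note: with use_rslora=True (rank-stabilized LoRA), the effective adapter
    # scaling is lora_alpha / sqrt(r) = 128 / 8 = 16 rather than lora_alpha / r.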

    return model, tokenizer


def load_and_format_dataset(
    tokenizer: AutoTokenizer,
) -> tuple[
    Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset], AutoTokenizer
]:
    """Load and format the training dataset."""
    # Load the code-act dataset
    dataset = load_dataset("xingyaoww/code-act", split="codeact")

    # Configure the chat template
    tokenizer = get_chat_template(
        tokenizer,
        chat_template="chatml",  # also supports zephyr, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
        mapping={
            "role": "from",
            "content": "value",
            "user": "human",
            "assistant": "gpt",
        },  # ShareGPT-style keys
        map_eos_token=True,  # maps <|im_end|> to </s>
    )

    def formatting_prompts_func(examples):
        convos = examples["conversations"]
        texts = [
            tokenizer.apply_chat_template(
                convo, tokenize=False, add_generation_prompt=False
            )
            for convo in convos
        ]
        return {"text": texts}

    # Apply the formatting to the dataset
    dataset = dataset.map(formatting_prompts_func, batched=True)

    return dataset, tokenizer


def create_trainer(
    model: FastLanguageModel,
    tokenizer: AutoTokenizer,
    dataset: Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset],
) -> Trainer:
    """Create and configure the SFTTrainer."""
    return SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=dataset,
        dataset_text_field="text",
        max_seq_length=max_seq_length,
        dataset_num_proc=2,
        packing=False,  # one conversation per sequence; True would pack short examples together
        args=TrainingArguments(
            per_device_train_batch_size=2,
            gradient_accumulation_steps=16,  # effective batch size: 2 * 16 = 32
            warmup_steps=100,
            max_steps=120,
            learning_rate=5e-5,
            fp16=not is_bfloat16_supported(),
            bf16=is_bfloat16_supported(),
            logging_steps=1,
            optim="adamw_8bit",
            weight_decay=0.01,
            lr_scheduler_type="cosine_with_restarts",
            seed=3407,
            output_dir="outputs",
            gradient_checkpointing=True,
            save_strategy="steps",
            save_steps=30,  # checkpoints at steps 30/60/90/120; only the last 2 are kept
            save_total_limit=2,
        ),
    )


def main():
    """Main training function."""
    # Optionally install dependencies (helper is commented out above)
    # install_dependencies()

    # Load the model and tokenizer
    model, tokenizer = load_model()

    # Load and prepare the dataset
    dataset, tokenizer = load_and_format_dataset(tokenizer)

    # Create the trainer
    trainer: Trainer = create_trainer(model, tokenizer, dataset)

    # Train
    trainer.train()
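
    # To resume an interrupted run from the most recent checkpoint under
    # output_dir ("outputs"), one could call instead:
    #   trainer.train(resume_from_checkpoint=True)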

    # Save the model
    trainer.save_model("final_model")
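
    # A minimal generation smoke test (a sketch, not a proper evaluation; the
    # prompt and generation settings are illustrative assumptions):
    FastLanguageModel.for_inference(model)  # switch the Unsloth model to inference mode
    messages = [{"from": "human", "value": "Write a Python function that reverses a string."}]
    input_ids = tokenizer.apply_chat_template(
        messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)
    outputs = model.generate(input_ids=input_ids, max_new_tokens=128)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))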


if __name__ == "__main__":
    main()