qmaru committed on
Commit 133f638 · verified · 1 Parent(s): 799745a

Upload train.py

Files changed (1)
  1. train.py +23 -7
train.py CHANGED
@@ -1,8 +1,17 @@
+import os
+
 from unsloth import FastModel
-from datasets import load_dataset
-from trl import SFTConfig, SFTTrainer
 from unsloth.chat_templates import get_chat_template, train_on_responses_only
 import torch
+from trl.trainer.sft_config import SFTConfig
+from trl.trainer.sft_trainer import SFTTrainer
+
+from datasets import load_dataset
+
+torch.backends.cudnn.benchmark = True
+
+use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
+use_fp16 = torch.cuda.is_available() and not use_bf16
 
 max_seq_length = 2048
 
@@ -12,10 +21,13 @@ model, tokenizer = FastModel.from_pretrained(
     load_in_4bit=False, # 4 bit quantization to reduce memory
     load_in_8bit=False, # [NEW!] A bit more accurate, uses 2x memory
     full_finetuning=False, # [NEW!] We have full finetuning now!
-    dtype=torch.bfloat16,
+    dtype=torch.bfloat16 if use_bf16 else torch.float16 if use_fp16 else torch.float32,
     # token = "hf_...", # use one if using gated models
 )
 
+if torch.cuda.is_available():
+    torch.cuda.empty_cache()
+
 model = FastModel.get_peft_model(
     model,
     r=128, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
@@ -43,8 +55,8 @@ tokenizer = get_chat_template(
     chat_template="gemma3",
 )
 
-# dataset = load_dataset("json", data_files="dataset_min.json", split="train")
-dataset = load_dataset("qmaru/gemma3-sms", split="train")
+dataset = load_dataset("json", data_files="datasets/code/data.json", split="train")
+# dataset = load_dataset("qmaru/gemma3-sms", split="train")
 
 dataset = dataset.shuffle(seed=42)
 
@@ -96,7 +108,8 @@ trainer = SFTTrainer(
         remove_unused_columns=True,
         dataloader_pin_memory=True,
         dataloader_num_workers=4,
-        bf16=True, # Use 16-bit precision for training
+        bf16=use_bf16,
+        fp16=use_fp16,
     ),
 )
 
@@ -109,5 +122,8 @@ trainer = train_on_responses_only(
 trainer_stats = trainer.train()
 
 model.save_pretrained_merged("model", tokenizer, save_method="merged_16bit")
-model.save_pretrained_gguf("model", tokenizer, quantization_method="q8_0")
 model.save_pretrained_gguf("model", tokenizer, quantization_method="f16")
+model.save_pretrained_gguf("model", tokenizer, quantization_method="q8_0")
+# model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
+
+os.system("./llama.cpp/build/bin/llama-quantize model.F16.gguf model.Q4_K_M.gguf 15")
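A note on the final llama.cpp step: llama-quantize takes an input GGUF, an output path, and a quantization type, and the trailing 15 in the committed command is llama.cpp's numeric ftype for Q4_K_M (LLAMA_FTYPE_MOSTLY_Q4_K_M); the tool also accepts the type name directly. Below is a minimal sketch of an equivalent call via subprocess instead of os.system, assuming the same llama.cpp build path and output filenames as the script above:

import subprocess

# Sketch: quantize the exported 16-bit GGUF down to Q4_K_M.
# Passing "Q4_K_M" by name is equivalent to passing the numeric ftype 15.
subprocess.run(
    [
        "./llama.cpp/build/bin/llama-quantize",
        "model.F16.gguf",     # input: F16 GGUF written by save_pretrained_gguf
        "model.Q4_K_M.gguf",  # output: 4-bit k-quant GGUF
        "Q4_K_M",             # quantization type by name
    ],
    check=True,  # raise CalledProcessError if quantization fails
)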