# nemo3-finetune/sft_nemo3_native.py
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "torch",
# "transformers>=4.57.0",
# "trl>=0.12.0",
# "datasets",
# "peft>=0.7.0",
# "accelerate",
# "bitsandbytes",
# "sentencepiece",
# "protobuf",
# "trackio",
# ]
# ///
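# The block above is PEP 723 inline script metadata; tools that support it
# (e.g. `uv run sft_nemo3_native.py`) resolve these dependencies automatically.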
from __future__ import annotations

import os

# Keep the Hugging Face cache inside the project directory; this must be set
# before any transformers/datasets import reads HF_HOME.
os.environ.setdefault("HF_HOME", "./.hf_home")

import torch
from datasets import load_dataset
from peft import LoraConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import SFTConfig, SFTTrainer
MODEL_ID = os.environ.get("MODEL_ID", "unsloth/Nemotron-3-Nano-30B-A3B")
DATASET_NAME = os.environ.get("DATASET_NAME", "HuggingFaceH4/Multilingual-Thinking")
HUB_MODEL_ID = os.environ.get("HUB_MODEL_ID", "burtenshaw/nemotron3-nano-multilingual-thinking")
MAX_STEPS = int(os.environ.get("MAX_STEPS", "100"))
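
# All settings above can be overridden from the environment, e.g.
# (hypothetical values):
#   MODEL_ID=my-org/my-model MAX_STEPS=200 uv run sft_nemo3_native.py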


def merge_thinking(example):
    """Fold an assistant message's separate `thinking` field into `content`,
    wrapped in <think> tags, so the reasoning trace becomes part of the
    training text."""
    new_msgs = []
    for msg in example["messages"]:
        m = dict(msg)
        content = m.get("content", "")
        thinking = m.pop("thinking", None)
        if isinstance(thinking, str) and thinking.strip():
            content = f"<think>\n{thinking}\n</think>\n{content}"
        m["content"] = content
        new_msgs.append(m)
    return {**example, "messages": new_msgs}
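
# Illustration of merge_thinking (hypothetical record; assumes the dataset
# stores the reasoning trace in a separate `thinking` field):
#   {"role": "assistant", "thinking": "Let me check.", "content": "Yes."}
# becomes
#   {"role": "assistant", "content": "<think>\nLet me check.\n</think>\nYes."}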


def main():
    print(f"[config] model={MODEL_ID} dataset={DATASET_NAME} hub={HUB_MODEL_ID} steps={MAX_STEPS}")
    torch.manual_seed(42)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(42)
        print(f"[cuda] {torch.cuda.get_device_name(0)}")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    print("[loading] model...")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        dtype=torch.bfloat16,  # `torch_dtype` was renamed to `dtype` in transformers 4.56
        use_cache=False,  # the KV cache is incompatible with gradient checkpointing
        trust_remote_code=True,
        device_map="auto",
        low_cpu_mem_usage=True,
    )
    # LoRA adapters on the attention and MLP projection matrices.
    peft_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )
print("[loading] dataset...")
dataset = load_dataset(DATASET_NAME, split="train")
drop = [c for c in ["reasoning_language","developer","user","analysis","final"] if c in dataset.column_names]
if drop: dataset = dataset.remove_columns(drop)
dataset = dataset.map(merge_thinking)
def fmt(ex):
return {"text": [tokenizer.apply_chat_template(c, tokenize=False, add_generation_prompt=False) for c in ex["messages"]]}
dataset = dataset.map(fmt, batched=True, remove_columns=[c for c in dataset.column_names if c != "text"])
print(f"[info] {len(dataset)} examples")
    args = SFTConfig(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=8,  # effective batch size of 8
        warmup_steps=10,
        max_steps=MAX_STEPS,
        learning_rate=2e-4,
        optim="paged_adamw_8bit",
        logging_steps=1,
        save_steps=25,
        output_dir="nemotron3-sft",
        max_length=2048,
        gradient_checkpointing=True,
        gradient_checkpointing_kwargs={"use_reentrant": False},
        bf16=True,
        push_to_hub=True,
        hub_model_id=HUB_MODEL_ID,
        hub_strategy="every_save",  # upload a checkpoint to the Hub at each save
        report_to="trackio",
        run_name="nemotron3-multilingual-thinking",
    )
    trainer = SFTTrainer(
        model=model,
        args=args,
        train_dataset=dataset,
        peft_config=peft_config,
        processing_class=tokenizer,
    )
    print("[training]...")
    trainer.train()
    trainer.push_to_hub()
    print(f"[done] https://huggingface.co/{HUB_MODEL_ID}")


if __name__ == "__main__":
    main()
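
# To try the pushed adapter afterwards, a minimal sketch (assumes the LoRA
# weights were uploaded to HUB_MODEL_ID by the trainer):
#   from peft import PeftModel
#   base = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype=torch.bfloat16,
#                                               device_map="auto", trust_remote_code=True)
#   model = PeftModel.from_pretrained(base, HUB_MODEL_ID)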