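"""LoRA supervised fine-tuning (SFT) of a Nemotron checkpoint on the
HuggingFaceH4/Multilingual-Thinking dataset, using TRL's SFTTrainer.

All settings are read from environment variables (MODEL_ID, DATASET_NAME,
HUB_MODEL_ID, MAX_STEPS), with the defaults defined below.
"""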
from __future__ import annotations

import os
from typing import Any, Dict

# Point the Hugging Face cache at a local directory before importing
# libraries that read HF_HOME at import time.
os.environ.setdefault("HF_HOME", "./.hf_home")

import torch
from datasets import load_dataset
from peft import LoraConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import SFTConfig, SFTTrainer


MODEL_ID = os.environ.get("MODEL_ID", "unsloth/Nemotron-3-Nano-30B-A3B")
DATASET_NAME = os.environ.get("DATASET_NAME", "HuggingFaceH4/Multilingual-Thinking")
HUB_MODEL_ID = os.environ.get("HUB_MODEL_ID", "burtenshaw/nemotron3-nano-multilingual-thinking")
MAX_STEPS = int(os.environ.get("MAX_STEPS", "100"))

def merge_thinking(example: Dict[str, Any]) -> Dict[str, Any]:
    """Fold each message's separate `thinking` field into its `content`,
    wrapped in <think> tags, so the reasoning trace is part of the
    training text."""
    new_msgs = []
    for msg in example["messages"]:
        m = dict(msg)
        content = m.get("content", "")
        thinking = m.pop("thinking", None)
        if isinstance(thinking, str) and thinking.strip():
            content = f"<think>\n{thinking}\n</think>\n{content}"
        m["content"] = content
        new_msgs.append(m)
    return {**example, "messages": new_msgs}
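
# Example transformation performed by merge_thinking (illustrative values):
#   {"role": "assistant", "thinking": "Réfléchissons...", "content": "Bonjour."}
# becomes
#   {"role": "assistant", "content": "<think>\nRéfléchissons...\n</think>\nBonjour."}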


def main():
    print(f"[config] model={MODEL_ID} dataset={DATASET_NAME} hub={HUB_MODEL_ID} steps={MAX_STEPS}")

    # Seed the RNGs for reproducible runs.
    torch.manual_seed(42)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(42)
        print(f"[cuda] {torch.cuda.get_device_name(0)}")

    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    # Some causal LMs ship without a pad token; reuse EOS for padding.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    print("[loading] model...")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.bfloat16,
        use_cache=False,  # the KV cache is unused in training and conflicts with gradient checkpointing
        trust_remote_code=True,
        device_map="auto",
        low_cpu_mem_usage=True,
    )

    # LoRA adapters on the attention and MLP projection layers.
    peft_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )
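    # Note: the target module names above assume a Llama-style layout
    # (q/k/v/o attention projections plus gate/up/down MLP projections).
    # If the checkpoint names its modules differently (MoE experts often do),
    # inspect them with, e.g.:
    #   sorted({n.split(".")[-1] for n, _ in model.named_modules()})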

    print("[loading] dataset...")
    dataset = load_dataset(DATASET_NAME, split="train")

    # Drop auxiliary columns so only the chat `messages` feed the template.
    drop = [c for c in ["reasoning_language", "developer", "user", "analysis", "final"] if c in dataset.column_names]
    if drop:
        dataset = dataset.remove_columns(drop)
    dataset = dataset.map(merge_thinking)

    def fmt(ex):
        # Render each conversation into a single training string via the chat template.
        return {"text": [tokenizer.apply_chat_template(c, tokenize=False, add_generation_prompt=False) for c in ex["messages"]]}

    dataset = dataset.map(fmt, batched=True, remove_columns=[c for c in dataset.column_names if c != "text"])
    print(f"[info] {len(dataset)} examples")

    args = SFTConfig(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=8,  # effective batch size of 8 per device
        warmup_steps=10,
        max_steps=MAX_STEPS,
        learning_rate=2e-4,
        optim="paged_adamw_8bit",
        logging_steps=1,
        save_steps=25,
        output_dir="nemotron3-sft",
        max_length=2048,
        gradient_checkpointing=True,
        gradient_checkpointing_kwargs={"use_reentrant": False},
        bf16=True,
        push_to_hub=True,
        hub_model_id=HUB_MODEL_ID,
        hub_strategy="every_save",  # upload each checkpoint as it is saved
        report_to="trackio",
        run_name="nemotron3-multilingual-thinking",
    )

    trainer = SFTTrainer(
        model=model,
        args=args,
        train_dataset=dataset,
        peft_config=peft_config,
        processing_class=tokenizer,
    )
    print("[training]...")
    trainer.train()
    trainer.push_to_hub()
    print(f"[done] https://huggingface.co/{HUB_MODEL_ID}")


if __name__ == "__main__":
    main()
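

# Example invocation (filename illustrative; adjust to this script's actual name):
#   MAX_STEPS=200 HUB_MODEL_ID=<user>/<repo> python sft_multilingual_thinking.py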