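# NOTE: minimal sketch of the inline UV metadata ("# /// script" block) that the
# module docstring below refers to. The dependency list is inferred from this
# file's imports (bitsandbytes backs the `paged_adamw_8bit` optimizer,
# accelerate backs `device_map="auto"`); versions are left unpinned -- pin them
# for reproducible Jobs runs.
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "torch",
#     "transformers",
#     "datasets",
#     "peft",
#     "trl",
#     "accelerate",
#     "bitsandbytes",
# ]
# ///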
| """ |
| Fine-tune `unsloth/Nemotron-3-Nano-30B-A3B` with TRL SFT + LoRA |
| (Jobs-friendly script). |
| |
| Original notebook provenance (for reference): |
| |
| TRL_SFT_Nemotron-3-Nano-30B-A3B_A100 |
| |
| Automatically generated by Colab. |
| Original file is located at: |
| https://colab.research.google.com/drive/1wLKOrvU540gUF6HKe3KotcLCvCijh41V |
| |
| Notebook install cells like `!uv pip install ...` were removed; on HF Jobs, |
| run this as a UV script (`hf jobs uv run`) and dependencies are installed |
| from the `# /// script` block at the top. |
| |
| This file is a refactor of a Colab notebook export into a **non-interactive CLI |
| script** that can be executed in Hugging Face **Jobs** (e.g. `hf jobs run ... |
| python ...`). |
| |
| See `README.md` for copy/paste `hf jobs run` commands and secrets. |
| |
| Docs referenced (Jobs + UV): |
| https://huggingface.co/docs/huggingface_hub/en/guides/jobs |
| """ |

from __future__ import annotations

import argparse
import os
from typing import Any, Dict, List


def merge_thinking_into_content(example: Dict[str, Any]) -> Dict[str, Any]:
    """Fold an optional per-message `thinking` field into `content`.

    When a message carries a non-empty `thinking` string, wrap it in
    `<think>...</think>` and prepend it to the message content, so the chat
    template only has to render a single `content` field.
    """
    new_messages: List[Dict[str, Any]] = []
    for msg in example["messages"]:
        msg2 = dict(msg)
        content = msg2.get("content", "")
        thinking = msg2.pop("thinking", None)
        if isinstance(thinking, str) and thinking.strip():
            content = f"<think>\n{thinking}\n</think>\n{content}"
        msg2["content"] = content
        new_messages.append(msg2)
    return {**example, "messages": new_messages}
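
# Illustrative input/output for merge_thinking_into_content (made-up values):
#   {"role": "assistant", "thinking": "Check the capital.", "content": "Paris."}
# becomes
#   {"role": "assistant",
#    "content": "<think>\nCheck the capital.\n</think>\nParis."}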


def main() -> None:
    # Defaults are overridable via environment variables so the script can be
    # configured through HF Jobs `--env` / `--secrets` as well as CLI flags.
    DEFAULT_OUTPUT_DIR = os.environ.get("OUTPUT_DIR") or "nemo3-sft-lora"
    DEFAULT_HUB_MODEL_ID = os.environ.get("HUB_MODEL_ID") or None
    _PUSH_TO_HUB_ENV = (os.environ.get("PUSH_TO_HUB") or "").strip().lower()
    DEFAULT_PUSH_TO_HUB = _PUSH_TO_HUB_ENV in ("1", "true", "yes", "y", "on")
    DEFAULT_ATTN_IMPL = os.environ.get("ATTN_IMPL") or "eager"
    DEFAULT_DTYPE = (os.environ.get("DTYPE") or "float16").strip().lower()
    DEFAULT_SEED = int((os.environ.get("SEED") or "42").strip())

    p = argparse.ArgumentParser(
        description=(
            "SFT + LoRA fine-tune of Nemotron-3-Nano-30B-A3B with TRL, "
            "suitable for HF Jobs."
        )
    )

    p.add_argument("--model-id", default="unsloth/Nemotron-3-Nano-30B-A3B")
    p.add_argument("--dataset-name", default="HuggingFaceH4/Multilingual-Thinking")
    p.add_argument("--dataset-split", default="train")

    p.add_argument("--output-dir", default=DEFAULT_OUTPUT_DIR)
    p.add_argument(
        "--hub-model-id",
        default=DEFAULT_HUB_MODEL_ID,
        help=(
            "Optional explicit repo id (e.g. 'username/nemo3-sft-lora'). "
            "If omitted and --push-to-hub is set, TRL derives the repo name "
            "from output_dir."
        ),
    )
    p.add_argument("--push-to-hub", action="store_true", default=DEFAULT_PUSH_TO_HUB)

    p.add_argument("--max-steps", type=int, default=30)
    p.add_argument("--per-device-train-batch-size", type=int, default=1)
    p.add_argument("--gradient-accumulation-steps", type=int, default=4)
    p.add_argument("--warmup-steps", type=int, default=5)
    p.add_argument("--learning-rate", type=float, default=2e-4)
    p.add_argument("--max-length", type=int, default=128)
    p.add_argument("--logging-steps", type=int, default=1)

    p.add_argument(
        "--attn-implementation",
        default=DEFAULT_ATTN_IMPL,
        help="e.g. 'eager' or 'flash_attention_2' (if supported).",
    )
    p.add_argument(
        "--dtype",
        default=DEFAULT_DTYPE,
        choices=["float16", "bfloat16"],
    )
    p.add_argument("--seed", type=int, default=DEFAULT_SEED)

    cfg = p.parse_args()

    # Keep all Hugging Face caches inside the working directory so the Jobs
    # container does not write large files to the default home cache.
    os.environ.setdefault("HF_HOME", os.path.abspath("./.hf_home"))
    os.environ.setdefault(
        "TRANSFORMERS_CACHE",
        os.path.abspath("./.hf_home/transformers"),
    )
    os.environ.setdefault(
        "HF_DATASETS_CACHE",
        os.path.abspath("./.hf_home/datasets"),
    )

    # Import the heavy ML stack only after the cache environment variables
    # are set, so the libraries pick them up at import time.
    import torch
    from datasets import load_dataset
    from peft import LoraConfig
    from transformers import (
        AutoModelForCausalLM,
        AutoTokenizer,
    )
    from trl import SFTConfig, SFTTrainer

    torch.manual_seed(cfg.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(cfg.seed)
        torch.backends.cuda.matmul.allow_tf32 = True

    torch_dtype = torch.float16 if cfg.dtype == "float16" else torch.bfloat16

    print(f"[config] model_id={cfg.model_id}")
    print(f"[config] dataset={cfg.dataset_name}:{cfg.dataset_split}")
    print(
        f"[config] output_dir={cfg.output_dir} "
        f"push_to_hub={cfg.push_to_hub} hub_model_id={cfg.hub_model_id}"
    )

    tokenizer = AutoTokenizer.from_pretrained(
        cfg.model_id,
        trust_remote_code=True,
    )
    if tokenizer.pad_token is None:
        # Some chat models ship without a pad token; fall back to EOS.
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        cfg.model_id,
        attn_implementation=cfg.attn_implementation,
        torch_dtype=torch_dtype,
        use_cache=False,  # KV cache is incompatible with gradient checkpointing
        trust_remote_code=True,
        device_map="auto",
        low_cpu_mem_usage=True,
    )

    # LoRA over the attention and MLP projection layers.
    peft_config = LoraConfig(
        r=8,
        lora_alpha=16,
        target_modules=[
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
        ],
    )

    dataset = load_dataset(cfg.dataset_name, split=cfg.dataset_split)

    # Drop auxiliary dataset columns that the chat template does not use,
    # when present.
    drop_cols = [
        c
        for c in [
            "reasoning_language",
            "developer",
            "user",
            "analysis",
            "final",
        ]
        if c in dataset.column_names
    ]
    if drop_cols:
        dataset = dataset.remove_columns(column_names=drop_cols)

    dataset = dataset.map(merge_thinking_into_content)

    def formatting_prompts_func(examples: Dict[str, Any]) -> Dict[str, Any]:
        # Render each conversation to a single training string via the
        # model's chat template.
        convos = examples["messages"]
        texts = [
            tokenizer.apply_chat_template(
                convo,
                tokenize=False,
                add_generation_prompt=False,
            )
            for convo in convos
        ]
        return {"text": texts}

    dataset = dataset.map(
        formatting_prompts_func,
        batched=True,
        remove_columns=[c for c in dataset.column_names if c != "text"],
    )

    # Empty list disables experiment trackers (W&B, TensorBoard, ...) in the Job.
    report_to: List[str] = []

    training_args = SFTConfig(
        per_device_train_batch_size=cfg.per_device_train_batch_size,
        gradient_accumulation_steps=cfg.gradient_accumulation_steps,
        warmup_steps=cfg.warmup_steps,
        max_steps=cfg.max_steps,
        learning_rate=cfg.learning_rate,
        optim="paged_adamw_8bit",
        logging_steps=cfg.logging_steps,
        report_to=report_to,
        output_dir=cfg.output_dir,
        max_length=cfg.max_length,
        activation_offloading=True,
        gradient_checkpointing=True,
        gradient_checkpointing_kwargs={"use_reentrant": False},
        push_to_hub=cfg.push_to_hub,
        hub_model_id=cfg.hub_model_id,
    )

    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        peft_config=peft_config,
        processing_class=tokenizer,
    )

    trainer.train()

    # Always save the adapter and tokenizer locally, even when pushing to Hub.
    trainer.save_model(cfg.output_dir)
    tokenizer.save_pretrained(cfg.output_dir)

    if cfg.push_to_hub:
        print("[hub] pushing to hub...")
        trainer.push_to_hub()

    print("[done] training complete")


if __name__ == "__main__":
    main()