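# NOTE: minimal sketch of the inline UV metadata ("# /// script" block) that the
# module docstring below refers to. The dependency list is inferred from this
# file's imports (bitsandbytes backs the `paged_adamw_8bit` optimizer,
# accelerate backs `device_map="auto"`); versions are left unpinned -- pin them
# for reproducible Jobs runs.
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "torch",
#     "transformers",
#     "datasets",
#     "peft",
#     "trl",
#     "accelerate",
#     "bitsandbytes",
# ]
# ///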
| """ |
| Fine-tune `unsloth/Nemotron-3-Nano-30B-A3B` with TRL SFT + LoRA |
| (Jobs-friendly script). |
| |
| Original notebook provenance (for reference): |
| |
| TRL_SFT_Nemotron-3-Nano-30B-A3B_A100 |
| |
| Automatically generated by Colab. |
| Original file is located at: |
| https://colab.research.google.com/drive/1wLKOrvU540gUF6HKe3KotcLCvCijh41V |
| |
| Notebook install cells like `!uv pip install ...` were removed; on HF Jobs, |
| run this as a UV script (`hf jobs uv run`) and dependencies are installed |
| from the `# /// script` block at the top. |
| |
| This file is a refactor of a Colab notebook export into a **non-interactive CLI |
| script** that can be executed in Hugging Face **Jobs** (e.g. `hf jobs run ... |
| python ...`). |
| |
| See `README.md` for copy/paste `hf jobs run` commands and secrets. |
| |
| Docs referenced (Jobs + UV): |
| https://huggingface.co/docs/huggingface_hub/en/guides/jobs |
| """ |

from __future__ import annotations

import argparse
import os
from typing import Any, Dict, List


def merge_thinking_into_content(example: Dict[str, Any]) -> Dict[str, Any]:
    """Fold an optional per-message `thinking` field into `content`.

    When a message carries a non-empty `thinking` string, wrap it in
    `<think>...</think>` and prepend it to the message content, so the chat
    template only has to render a single `content` field.
    """
    new_messages: List[Dict[str, Any]] = []
    for msg in example["messages"]:
        msg2 = dict(msg)
        content = msg2.get("content", "")
        thinking = msg2.pop("thinking", None)
        if isinstance(thinking, str) and thinking.strip():
            content = f"<think>\n{thinking}\n</think>\n{content}"
        msg2["content"] = content
        new_messages.append(msg2)
    return {**example, "messages": new_messages}
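
# Illustrative input/output for merge_thinking_into_content (made-up values):
#   {"role": "assistant", "thinking": "Check the capital.", "content": "Paris."}
# becomes
#   {"role": "assistant",
#    "content": "<think>\nCheck the capital.\n</think>\nParis."}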


def main() -> None:
    # Defaults are overridable via environment variables so the script can be
    # configured through HF Jobs `--env` / `--secrets` as well as CLI flags.
    DEFAULT_OUTPUT_DIR = os.environ.get("OUTPUT_DIR") or "nemo3-sft-lora"
    DEFAULT_HUB_MODEL_ID = os.environ.get("HUB_MODEL_ID") or None
    _PUSH_TO_HUB_ENV = (os.environ.get("PUSH_TO_HUB") or "").strip().lower()
    DEFAULT_PUSH_TO_HUB = _PUSH_TO_HUB_ENV in ("1", "true", "yes", "y", "on")
    DEFAULT_ATTN_IMPL = os.environ.get("ATTN_IMPL") or "eager"
    DEFAULT_DTYPE = (os.environ.get("DTYPE") or "float16").strip().lower()
    DEFAULT_SEED = int((os.environ.get("SEED") or "42").strip())

    p = argparse.ArgumentParser(
        description=(
            "SFT + LoRA fine-tune of Nemotron-3-Nano-30B-A3B with TRL, "
            "suitable for HF Jobs."
        )
    )

    p.add_argument("--model-id", default="unsloth/Nemotron-3-Nano-30B-A3B")
    p.add_argument("--dataset-name", default="HuggingFaceH4/Multilingual-Thinking")
    p.add_argument("--dataset-split", default="train")

    p.add_argument("--output-dir", default=DEFAULT_OUTPUT_DIR)
    p.add_argument(
        "--hub-model-id",
        default=DEFAULT_HUB_MODEL_ID,
        help=(
            "Optional explicit repo id (e.g. 'username/nemo3-sft-lora'). "
            "If omitted and --push-to-hub is set, TRL derives the repo name "
            "from output_dir."
        ),
    )
    p.add_argument("--push-to-hub", action="store_true", default=DEFAULT_PUSH_TO_HUB)

    p.add_argument("--max-steps", type=int, default=30)
    p.add_argument("--per-device-train-batch-size", type=int, default=1)
    p.add_argument("--gradient-accumulation-steps", type=int, default=4)
    p.add_argument("--warmup-steps", type=int, default=5)
    p.add_argument("--learning-rate", type=float, default=2e-4)
    p.add_argument("--max-length", type=int, default=128)
    p.add_argument("--logging-steps", type=int, default=1)

    p.add_argument(
        "--attn-implementation",
        default=DEFAULT_ATTN_IMPL,
        help="e.g. 'eager' or 'flash_attention_2' (if supported).",
    )
    p.add_argument(
        "--dtype",
        default=DEFAULT_DTYPE,
        choices=["float16", "bfloat16"],
    )
    p.add_argument("--seed", type=int, default=DEFAULT_SEED)

    cfg = p.parse_args()

    # Keep all Hugging Face caches inside the working directory so the Jobs
    # container does not write large files to the default home cache.
    os.environ.setdefault("HF_HOME", os.path.abspath("./.hf_home"))
    os.environ.setdefault(
        "TRANSFORMERS_CACHE",
        os.path.abspath("./.hf_home/transformers"),
    )
    os.environ.setdefault(
        "HF_DATASETS_CACHE",
        os.path.abspath("./.hf_home/datasets"),
    )

    # Import the heavy ML stack only after the cache environment variables
    # are set, so the libraries pick them up at import time.
    import torch
    from datasets import load_dataset
    from peft import LoraConfig
    from transformers import (
        AutoModelForCausalLM,
        AutoTokenizer,
    )
    from trl import SFTConfig, SFTTrainer

    torch.manual_seed(cfg.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(cfg.seed)
        torch.backends.cuda.matmul.allow_tf32 = True

    torch_dtype = torch.float16 if cfg.dtype == "float16" else torch.bfloat16

    print(f"[config] model_id={cfg.model_id}")
    print(f"[config] dataset={cfg.dataset_name}:{cfg.dataset_split}")
    print(
        f"[config] output_dir={cfg.output_dir} "
        f"push_to_hub={cfg.push_to_hub} hub_model_id={cfg.hub_model_id}"
    )

    tokenizer = AutoTokenizer.from_pretrained(
        cfg.model_id,
        trust_remote_code=True,
    )
    if tokenizer.pad_token is None:
        # Some chat models ship without a pad token; fall back to EOS.
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        cfg.model_id,
        attn_implementation=cfg.attn_implementation,
        torch_dtype=torch_dtype,
        use_cache=False,  # KV cache is incompatible with gradient checkpointing
        trust_remote_code=True,
        device_map="auto",
        low_cpu_mem_usage=True,
    )

    # LoRA over the attention and MLP projection layers.
    peft_config = LoraConfig(
        r=8,
        lora_alpha=16,
        target_modules=[
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
        ],
    )

    dataset = load_dataset(cfg.dataset_name, split=cfg.dataset_split)

    # Drop auxiliary dataset columns that the chat template does not use,
    # when present.
    drop_cols = [
        c
        for c in [
            "reasoning_language",
            "developer",
            "user",
            "analysis",
            "final",
        ]
        if c in dataset.column_names
    ]
    if drop_cols:
        dataset = dataset.remove_columns(column_names=drop_cols)

    dataset = dataset.map(merge_thinking_into_content)

    def formatting_prompts_func(examples: Dict[str, Any]) -> Dict[str, Any]:
        # Render each conversation to a single training string via the
        # model's chat template.
        convos = examples["messages"]
        texts = [
            tokenizer.apply_chat_template(
                convo,
                tokenize=False,
                add_generation_prompt=False,
            )
            for convo in convos
        ]
        return {"text": texts}

    dataset = dataset.map(
        formatting_prompts_func,
        batched=True,
        remove_columns=[c for c in dataset.column_names if c != "text"],
    )

    # Empty list disables experiment trackers (W&B, TensorBoard, ...) in the Job.
    report_to: List[str] = []

    training_args = SFTConfig(
        per_device_train_batch_size=cfg.per_device_train_batch_size,
        gradient_accumulation_steps=cfg.gradient_accumulation_steps,
        warmup_steps=cfg.warmup_steps,
        max_steps=cfg.max_steps,
        learning_rate=cfg.learning_rate,
        optim="paged_adamw_8bit",
        logging_steps=cfg.logging_steps,
        report_to=report_to,
        output_dir=cfg.output_dir,
        max_length=cfg.max_length,
        activation_offloading=True,
        gradient_checkpointing=True,
        gradient_checkpointing_kwargs={"use_reentrant": False},
        push_to_hub=cfg.push_to_hub,
        hub_model_id=cfg.hub_model_id,
    )

    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        peft_config=peft_config,
        processing_class=tokenizer,
    )

    trainer.train()

    # Always save the adapter and tokenizer locally, even when pushing to Hub.
    trainer.save_model(cfg.output_dir)
    tokenizer.save_pretrained(cfg.output_dir)

    if cfg.push_to_hub:
        print("[hub] pushing to hub...")
        trainer.push_to_hub()

    print("[done] training complete")


if __name__ == "__main__":
    main()