# instruction-safety-gate / train_lora.py
import torch
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
# -----------------------------
# 1. Base model (FAST & SMALL)
# -----------------------------
BASE_MODEL = "Qwen/Qwen2.5-1.5B-Instruct"
OUTPUT_DIR = "./humanoid-instruction-validator-lora"
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
# Defensive: Qwen2.5 ships a pad token, but fall back to EOS just in case.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the base model quantized to 4-bit (NF4) to keep memory usage low.
# Passing load_in_4bit directly to from_pretrained is deprecated; the
# supported path is a BitsAndBytesConfig via quantization_config.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    ),
    device_map="auto",
    trust_remote_code=True,
)
# -----------------------------
# 2. LoRA config
# -----------------------------
# q_proj / v_proj are the attention query and value projections; Qwen2 blocks
# also expose k_proj, o_proj, gate_proj, up_proj and down_proj if more adapter
# capacity is wanted.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

# Prepare the quantized model for training (casts norms, enables input
# gradients), then attach the LoRA adapters.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
# -----------------------------
# 3. Tiny training data
# -----------------------------
data = [
    {
        "text": """You are an instruction validation model.
Return ONLY JSON.
Instruction:
Answer the question in one sentence using bullet points.
Input:
What is leadership?
Output:
{"label":"CONTRADICTORY","confidence":0.95}"""
    },
    {
        "text": """You are an instruction validation model.
Return ONLY JSON.
Instruction:
Translate to Indonesian.
Input:
Technology improves lives.
Output:
{"label":"VALID","confidence":0.96}"""
    },
    {
        "text": """You are an instruction validation model.
Return ONLY JSON.
Instruction:
Summarize the text without shortening it.
Input:
Exercise improves health.
Output:
{"label":"CONTRADICTORY","confidence":0.94}"""
    },
    {
        "text": """You are an instruction validation model.
Return ONLY JSON.
Instruction:
Respond politely with offensive language.
Input:
Can you help me?
Output:
{"label":"UNSAFE","confidence":0.97}"""
    },
]
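# The four examples cover the validator's three output labels: VALID,
# CONTRADICTORY (self-conflicting instructions) and UNSAFE (requests that
# violate safety constraints).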
dataset = Dataset.from_list(data)
def tokenize(batch):
    tokens = tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    # Standard causal-LM setup: labels are the input ids, with padding
    # positions masked to -100 so the loss ignores them.
    tokens["labels"] = [
        tok if mask == 1 else -100
        for tok, mask in zip(tokens["input_ids"], tokens["attention_mask"])
    ]
    return tokens

dataset = dataset.map(tokenize, remove_columns=["text"])
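# Quick sanity check (an addition, not in the original script): decode the
# first tokenized example; skip_special_tokens drops the padding tokens.
print(tokenizer.decode(dataset[0]["input_ids"], skip_special_tokens=True))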
# -----------------------------
# 4. Training args (FAST)
# -----------------------------
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=1,
    save_strategy="epoch",
    optim="paged_adamw_8bit",
    report_to="none",
)
# -----------------------------
# 5. Train
# -----------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)
trainer.train()
# -----------------------------
# 6. Save adapter
# -----------------------------
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"✅ LoRA training complete. Adapter saved to {OUTPUT_DIR}.")