# instruction-safety-gate / train_lora.py
import torch
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
# -----------------------------
# 1. Base model (FAST & SMALL)
# -----------------------------
BASE_MODEL = "Qwen/Qwen2.5-1.5B-Instruct"
OUTPUT_DIR = "./humanoid-instruction-validator-lora"
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
# Defensive: Qwen2.5 ships a pad token, but fall back to EOS just in case.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the base model quantized to 4-bit (NF4) to keep memory usage low.
# Passing load_in_4bit directly to from_pretrained is deprecated; the
# supported path is a BitsAndBytesConfig via quantization_config.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    ),
    device_map="auto",
    trust_remote_code=True,
)
# -----------------------------
# 2. LoRA config
# -----------------------------
# q_proj / v_proj are the attention query and value projections; Qwen2 blocks
# also expose k_proj, o_proj, gate_proj, up_proj and down_proj if more adapter
# capacity is wanted.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

# Prepare the quantized model for training (casts norms, enables input
# gradients), then attach the LoRA adapters.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
# -----------------------------
# 3. Tiny training data
# -----------------------------
data = [
    {
        "text": """You are an instruction validation model.
Return ONLY JSON.
Instruction:
Answer the question in one sentence using bullet points.
Input:
What is leadership?
Output:
{"label":"CONTRADICTORY","confidence":0.95}"""
    },
    {
        "text": """You are an instruction validation model.
Return ONLY JSON.
Instruction:
Translate to Indonesian.
Input:
Technology improves lives.
Output:
{"label":"VALID","confidence":0.96}"""
    },
    {
        "text": """You are an instruction validation model.
Return ONLY JSON.
Instruction:
Summarize the text without shortening it.
Input:
Exercise improves health.
Output:
{"label":"CONTRADICTORY","confidence":0.94}"""
    },
    {
        "text": """You are an instruction validation model.
Return ONLY JSON.
Instruction:
Respond politely with offensive language.
Input:
Can you help me?
Output:
{"label":"UNSAFE","confidence":0.97}"""
    },
]
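# The four examples cover the validator's three output labels: VALID,
# CONTRADICTORY (self-conflicting instructions) and UNSAFE (requests that
# violate safety constraints).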
dataset = Dataset.from_list(data)
def tokenize(batch):
    tokens = tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    # Standard causal-LM setup: labels are the input ids, with padding
    # positions masked to -100 so the loss ignores them.
    tokens["labels"] = [
        tok if mask == 1 else -100
        for tok, mask in zip(tokens["input_ids"], tokens["attention_mask"])
    ]
    return tokens

dataset = dataset.map(tokenize, remove_columns=["text"])
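# Quick sanity check (an addition, not in the original script): decode the
# first tokenized example; skip_special_tokens drops the padding tokens.
print(tokenizer.decode(dataset[0]["input_ids"], skip_special_tokens=True))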
# -----------------------------
# 4. Training args (FAST)
# -----------------------------
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=1,
    save_strategy="epoch",
    optim="paged_adamw_8bit",
    report_to="none",
)
# -----------------------------
# 5. Train
# -----------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)
trainer.train()
# -----------------------------
# 6. Save adapter
# -----------------------------
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"✅ LoRA training complete. Adapter saved to {OUTPUT_DIR}.")