import torch
import trackio  # used indirectly via report_to="trackio"
from datasets import load_dataset
from peft import LoraConfig
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import GRPOTrainer, GRPOConfig

# Hub identifiers for the base model, the training data, and the output repo.
model_name = "ligaments-enterprise/llama3.2-1b-instruct-sec-finetuned"
dataset_name = "ligaments-enterprise/sec-data-preferences"
output_model = "ligaments-enterprise/llama3.2-1b-sec-grpo"

# Load the dataset and hold out 10% for evaluation.
dataset = load_dataset(dataset_name, split="train")
print(f"Loaded {len(dataset)} preference pairs from {dataset_name}")

dataset_split = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset_split["train"]
eval_dataset = dataset_split["test"]

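# GRPOTrainer generates its own completions during training, so it reads only
# a "prompt" column from the dataset; the reward function then scores the
# sampled outputs. A hedged sketch in case this preference set stores prompts
# elsewhere (the column names below are assumptions, not the actual schema):
#
# if "prompt" not in train_dataset.column_names:
#     def extract_prompt(example):
#         # e.g. take the user turn from a chosen-conversation column
#         return {"prompt": example["chosen"][0]["content"]}
#     train_dataset = train_dataset.map(extract_prompt)
#     eval_dataset = eval_dataset.map(extract_prompt)
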
# Llama tokenizers ship without a pad token; fall back to EOS.
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Prefer bf16 on GPUs that support it (Ampere or newer), otherwise fp16.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16,
    device_map="auto",
)

config = GRPOConfig(
    output_dir=output_model,
    num_train_epochs=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=8,  # effective train batch size of 8
    learning_rate=1e-6,

    # Evaluation, logging, and checkpointing cadence.
    eval_strategy="steps",
    eval_steps=50,
    logging_steps=10,
    save_strategy="steps",
    save_steps=100,

    # Push every saved checkpoint to the Hub.
    push_to_hub=True,
    hub_model_id=output_model,
    hub_strategy="every_save",

    # Memory savings and mixed precision.
    gradient_checkpointing=True,
    bf16=torch.cuda.is_bf16_supported(),
    fp16=not torch.cuda.is_bf16_supported(),

    # Experiment tracking.
    report_to="trackio",
    run_name="llama3.2-1b-sec-grpo-training",
    project="ligaments-sec-alignment",
)
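
# Note: GRPO samples several completions per prompt (TRL defaults to
# num_generations=8), and TRL requires the effective train batch size to be
# divisible by that number; the sizes above satisfy the default. The exact
# divisibility check varies across TRL versions, so treat this as a guideline.
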
def preference_reward_func(completions, **kwargs):
    """Toy length-based reward: GRPOTrainer passes the sampled completions
    (plus prompts and any extra dataset columns as keyword arguments) and
    expects one float per completion. Shorter responses score higher."""
    rewards = []
    for completion in completions:
        # Completions arrive as plain strings for standard-format datasets.
        response_length = len(completion.split())
        if response_length < 50:
            reward = 1.0
        elif response_length < 100:
            reward = 0.5
        else:
            reward = 0.0
        rewards.append(reward)
    return rewards

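# Quick sanity check of the reward shaping on made-up completions (the
# strings are hypothetical; this just confirms the length thresholds behave
# as intended).
_samples = ["short answer", " ".join(["word"] * 75), " ".join(["word"] * 150)]
print(f"Reward sanity check: {preference_reward_func(completions=_samples)}")
# -> [1.0, 0.5, 0.0]
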
trainer = GRPOTrainer(
    model=model,
    reward_funcs=[preference_reward_func],
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,  # reuse the tokenizer loaded above
    peft_config=LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    ),
    args=config,
)
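
# With a peft_config, the trainer wraps the base model in LoRA adapters; only
# those adapter weights (r=16 on the attention projections) are trained, and
# they are what gets saved and pushed at each checkpoint.
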
print("Starting GRPO training...") |
|
|
print(f"Training on {len(train_dataset)} preference pairs") |
|
|
print(f"Evaluating on {len(eval_dataset)} preference pairs") |
|
|
print(f"Output model will be saved to: {output_model}") |
|
|
|
|
|
|
|
|
trainer.train()

# Final push of the trained adapter (intermediate checkpoints were already
# uploaded via hub_strategy="every_save").
trainer.push_to_hub()

print("GRPO training completed successfully!") |
|
|
print(f"Final model available at: https://huggingface.co/{output_model}") |