# /// script
# dependencies = ["trl>=0.12.0", "peft>=0.7.0", "trackio", "torch", "transformers"]
# ///
from datasets import load_dataset
from peft import LoraConfig
from trl import GRPOTrainer, GRPOConfig
from transformers import AutoTokenizer, AutoModelForCausalLM
import trackio  # ensure the trackio backend is installed for report_to="trackio"
import torch
# Load your fine-tuned model and preference dataset
model_name = "ligaments-enterprise/llama3.2-1b-instruct-sec-finetuned"
dataset_name = "ligaments-enterprise/sec-data-preferences"
output_model = "ligaments-enterprise/llama3.2-1b-sec-grpo"
# Load dataset
dataset = load_dataset(dataset_name, split="train")
print(f"Loaded {len(dataset)} preference pairs from {dataset_name}")
# Create train/eval split for monitoring
dataset_split = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset_split["train"]
eval_dataset = dataset_split["test"]
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Load the model explicitly
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16,
    device_map="auto",
)
# Configure GRPO training
config = GRPOConfig(
    output_dir=output_model,
    num_train_epochs=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=8,  # Must be divisible by num_generations (default 8)
    gradient_accumulation_steps=8,  # Effective batch size = 8
    learning_rate=1e-6,
    # Evaluation and logging
    eval_strategy="steps",
    eval_steps=50,
    logging_steps=10,
    save_strategy="steps",
    save_steps=100,
    # Hub integration
    push_to_hub=True,
    hub_model_id=output_model,
    hub_strategy="every_save",
    # Optimization
    gradient_checkpointing=True,
    bf16=torch.cuda.is_bf16_supported(),
    fp16=not torch.cuda.is_bf16_supported(),
    # Trackio monitoring
    report_to="trackio",
    run_name="llama3.2-1b-sec-grpo-training",
    project="ligaments-sec-alignment",
)
# Define reward function for GRPO
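# TRL calls each reward function with keyword arguments (prompts, completions, and any
# extra dataset columns) and expects one float per completion; GRPO then normalizes the
# scores within each group of sampled completions to compute advantages.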
def preference_reward_func(completions, **kwargs):
    """Simple length-based reward: favor concise responses to counter verbosity."""
    rewards = []
    for completion in completions:
        # Prefer shorter, more concise responses (addresses the verbosity issue)
        response_length = len(completion.split())
        # Reward shorter responses (up to a reasonable length)
        if response_length < 50:
            reward = 1.0
        elif response_length < 100:
            reward = 0.5
        else:
            reward = 0.0  # Penalize overly verbose responses
        rewards.append(reward)
    return rewards
# Initialize GRPO trainer
trainer = GRPOTrainer(
    model=model,
    reward_funcs=[preference_reward_func],
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    ),
    args=config,
)
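# With peft_config provided, the trainer wraps the model with a LoRA adapter, so only the
# adapter weights (not the full base model) are trained and pushed to the Hub.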
print("Starting GRPO training...")
print(f"Training on {len(train_dataset)} preference pairs")
print(f"Evaluating on {len(eval_dataset)} preference pairs")
print(f"Output model will be saved to: {output_model}")
# Train the model
trainer.train()
# Push final model to Hub
trainer.push_to_hub()
print("GRPO training completed successfully!")
print(f"Final model available at: https://huggingface.co/{output_model}") |