# /// script
# dependencies = ["trl>=0.14.0", "peft>=0.7.0", "trackio", "torch", "transformers", "datasets"]
# ///

import os

import torch
from datasets import load_dataset
from peft import LoraConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import GRPOConfig, GRPOTrainer

# Load your fine-tuned model and preference dataset
model_name = "ligaments-enterprise/llama3.2-1b-instruct-sec-finetuned"
dataset_name = "ligaments-enterprise/sec-data-preferences"
output_model = "ligaments-enterprise/llama3.2-1b-sec-grpo"

# Load dataset; GRPO only consumes the prompts and scores freshly sampled
# completions with the reward function defined below
dataset = load_dataset(dataset_name, split="train")
print(f"Loaded {len(dataset)} examples from {dataset_name}")
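# GRPOTrainer requires a "prompt" column; assume this preference dataset
# follows the usual TRL layout (prompt/chosen/rejected) and fail fast if not
assert "prompt" in dataset.column_names, (
    f"Expected a 'prompt' column for GRPO, found {dataset.column_names}"
)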

# Create train/eval split for monitoring
dataset_split = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset_split["train"]
eval_dataset = dataset_split["test"]

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the model explicitly
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16,
    device_map="auto"
)

# Configure GRPO training. GRPOConfig has no `project` field; the transformers
# trackio integration reads the project name from the TRACKIO_PROJECT env var.
os.environ["TRACKIO_PROJECT"] = "ligaments-sec-alignment"

config = GRPOConfig(
    output_dir=output_model,
    num_train_epochs=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=8,  # Must be divisible by num_generations (default 8)
    gradient_accumulation_steps=8,  # Effective batch size = 8
    learning_rate=1e-6,

    # Evaluation and logging
    eval_strategy="steps",
    eval_steps=50,
    logging_steps=10,
    save_strategy="steps",
    save_steps=100,

    # Hub integration
    push_to_hub=True,
    hub_model_id=output_model,
    hub_strategy="every_save",

    # Optimization
    gradient_checkpointing=True,
    bf16=torch.cuda.is_bf16_supported(),
    fp16=not torch.cuda.is_bf16_supported(),

    # Trackio monitoring (project name comes from TRACKIO_PROJECT, set above)
    report_to="trackio",
    run_name="llama3.2-1b-sec-grpo-training",
)
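
# These settings interact: each optimisation step accumulates
# per_device_train_batch_size * gradient_accumulation_steps completions per
# device, and TRL's num_generations (default 8) must divide the global
# generation batch, hence the eval batch size of 8 above.
effective_batch = config.per_device_train_batch_size * config.gradient_accumulation_steps
print(f"Effective train batch size per device: {effective_batch}")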

# Define reward function for GRPO
def preference_reward_func(completions, **kwargs):
    """Length-shaped reward: prefer concise completions over verbose ones.

    TRL calls each reward function with `completions` (plus prompts and any
    extra dataset columns) passed as keyword arguments.
    """

    rewards = []
    for completion in completions:
        # Prefer shorter, more concise responses (addresses the verbosity issue);
        # assumes a standard-format dataset, so each completion is a plain string
        response_length = len(completion.split())
        # Reward shorter responses (up to a reasonable length)
        if response_length < 50:
            reward = 1.0
        elif response_length < 100:
            reward = 0.5
        else:
            reward = 0.0  # Penalize overly verbose responses
        rewards.append(reward)
    return rewards
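
# Quick self-test of the reward shaping before committing to a training run:
# 10 words -> 1.0, 70 words -> 0.5, 150 words -> 0.0
assert preference_reward_func(completions=["w " * 10, "w " * 70, "w " * 150]) == [1.0, 0.5, 0.0]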

# Initialize GRPO trainer
trainer = GRPOTrainer(
    model=model,
    processing_class=tokenizer,  # reuse the tokenizer with the pad token set above
    reward_funcs=[preference_reward_func],
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM"
    ),
    args=config,
)
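
# GRPOTrainer applies the LoRA config internally; report how small the
# trainable footprint is (PeftModel helper, guarded in case the wrapping
# differs across TRL versions)
if hasattr(trainer.model, "print_trainable_parameters"):
    trainer.model.print_trainable_parameters()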

print("Starting GRPO training...")
print(f"Training on {len(train_dataset)} examples")
print(f"Evaluating on {len(eval_dataset)} examples")
print(f"Output model will be saved to: {output_model}")

# Train the model
trainer.train()

# Push final model to Hub
trainer.push_to_hub()

print("GRPO training completed successfully!")
print(f"Final model available at: https://huggingface.co/{output_model}")
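
# To reuse the result later, the LoRA adapter can be loaded back on top of the
# base model (consumer-side sketch, assuming the adapter repo pushed above):
#
#   from peft import AutoPeftModelForCausalLM
#   model = AutoPeftModelForCausalLM.from_pretrained(output_model)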