"""
YOFO Training Script.
This script fine-tunes a language model using the YOFO method.
It uses LoRA for efficient training on consumer GPUs.
Key features:
- Loads mapped YOFO data
- Uses YOFOTemplateBuilder for correct tokenization
- Trains with L_answer loss (focusing only on the 12 safety bits)
- Saves the LoRA adapter
"""
import os
import sys
import json

import torch
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification,
)
from peft import LoraConfig, get_peft_model, TaskType

# Add the repository root to the path so the src.* imports resolve when running from the repo root
sys.path.append(os.getcwd())
from src.data.template import YOFOTemplateBuilder
class YOFODataset(Dataset):
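    """Dataset of mapped YOFO examples stored as JSON Lines (one record per line).

    Each record is expected to contain at least the keys used in __getitem__ below:
    'prompt', 'response', and 'requirements'; the exact field contents come from the
    upstream data-mapping step.
    """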
    def __init__(self, data_path, builder):
        self.data = []
        with open(data_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue  # Skip blank lines (e.g. a trailing newline)
                self.data.append(json.loads(line))
        self.builder = builder
        print(f"Loaded {len(self.data)} examples from {data_path}")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        # Build the YOFO input for one (prompt, response, requirements) triple
        yofo_input = self.builder.build_template(
            prompt=item['prompt'],
            response=item['response'],
            requirements=item['requirements']
        )
        # Return a dict compatible with the HuggingFace Trainer
        return {
            "input_ids": yofo_input.input_ids,
            "attention_mask": yofo_input.attention_mask,
            "labels": yofo_input.labels
        }
def train():
    # --- Configuration ---
    # Using a small, efficient model for demonstration.
    # Qwen2.5-1.5B-Instruct is capable and fits on a Colab T4 or a typical consumer GPU.
    # You can swap this for Qwen2-VL-2B if you specifically want the VLM from the paper.
    MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
    OUTPUT_DIR = "models/yofo_lora"
    BATCH_SIZE = 4          # Small batch size for consumer GPUs
    LEARNING_RATE = 2e-4
    EPOCHS = 3

    print(f"Initializing training with model: {MODEL_ID}")

    # 1. Load tokenizer & template builder
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    builder = YOFOTemplateBuilder(tokenizer)
    # 2. Load datasets
    train_dataset = YOFODataset("data/processed/train_yofo.jsonl", builder)
    val_dataset = YOFODataset("data/processed/val_yofo.jsonl", builder)
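
    # Optional sanity check (a sketch; assumes the dict returned by __getitem__ above):
    # inspect one example and confirm that only the answer positions carry real labels.
    # ex = train_dataset[0]
    # print("supervised label positions:", sum(1 for t in ex["labels"] if t != -100))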
    # 3. Load the base model
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16,
        device_map="auto",
        trust_remote_code=True
    )
    # 4. Configure LoRA
    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=16,               # LoRA rank
        lora_alpha=32,
        lora_dropout=0.05,
        # All attention and MLP projection layers of the Qwen2 architecture
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
    )
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()
    # 5. Set up the Trainer
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=4,  # Effective batch size per device: 4 * 4 = 16
        learning_rate=LEARNING_RATE,
        weight_decay=0.01,
        logging_steps=10,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        # Match mixed precision to the dtype the model was loaded in above;
        # enabling fp16 on a bfloat16 model can cause dtype/scaler errors.
        bf16=torch.cuda.is_bf16_supported(),
        fp16=not torch.cuda.is_bf16_supported(),
        report_to="none",               # Disable wandb for simplicity
        remove_unused_columns=False     # Important for custom datasets
    )
    # We need a data collator that pads variable-length examples in a batch.
    # The standard default_data_collator does not pad 'labels' with -100;
    # DataCollatorForTokenClassification pads labels with -100 by default,
    # so padded positions are ignored by the loss.
    data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
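    # Illustration (sketch, assuming a right-padding tokenizer): a length-3 example
    # padded to the batch maximum of 5 comes out as
    #   input_ids      -> [t1,   t2,   t3,   PAD,  PAD ]
    #   attention_mask -> [1,    1,    1,    0,    0   ]
    #   labels         -> [-100, -100, b1,   -100, -100]
    # so padded positions never contribute to the loss.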
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=data_collator,
    )

    # 6. Train
    print("\n🚀 Starting training...")
    trainer.train()

    # 7. Save the LoRA adapter and tokenizer
    print(f"\n💾 Saving model to {OUTPUT_DIR}")
    model.save_pretrained(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)
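
    # To reuse the adapter later (a sketch using the standard PEFT loading API):
    #   from peft import PeftModel
    #   base = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", trust_remote_code=True)
    #   model = PeftModel.from_pretrained(base, OUTPUT_DIR)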
if __name__ == "__main__":
    # Ensure the output directory exists
    os.makedirs("models", exist_ok=True)
    train()