# Unsloth is imported first so its patches to transformers/trl apply
# before the trainer classes are loaded. (Unused json/os/torch imports removed.)
from unsloth import FastLanguageModel
from datasets import Dataset
from trl import DPOTrainer, DPOConfig


def load_model():
    print("Initializing model loading...")
    model_name = "outputs_sample_code/checkpoint-200"
    max_seq_length = 512
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name,
        max_seq_length=max_seq_length,
        dtype=None,  # let Unsloth auto-detect (bfloat16 where supported)
        load_in_4bit=True,
    )
    print("Model and tokenizer loaded successfully.")
    print(f"Model type: {type(model)}, Tokenizer type: {type(tokenizer)}")

    if hasattr(model, 'config'):
        print("Setting max_seq_length in model.config")
        model.config.max_seq_length = max_seq_length
    else:
        print("Error: model.config does not exist!")

    # Attach LoRA adapters; only these low-rank weights are trained on top
    # of the frozen 4-bit base model.
    model = FastLanguageModel.get_peft_model(
        model,
        r=32,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj"],
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=3407,
        use_rslora=False,
        loftq_config=None,
        max_seq_length=max_seq_length,
    )
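    # Hedged addition, not in the original: the returned PEFT wrapper exposes
    # print_trainable_parameters(), useful to confirm that only the LoRA
    # weights (not the frozen base) are trainable.
    model.print_trainable_parameters()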
| | print("PEFT model configured.") |
| | return model, tokenizer |
| |
|
| | |
def load_dataset():
    print("Loading dataset...")
    dataset_name = "cyberagent/chatbot-arena-ja-calm2-7b-chat-experimental"

    # Imported locally: the module-level name load_dataset is this function
    # itself, so the import must shadow it inside this scope.
    from datasets import load_dataset
    dataset = load_dataset(dataset_name)

    # Map each arena comparison into the prompt/chosen/rejected schema
    # that DPOTrainer expects.
    formatted_data = []
    for item in dataset["train"]:
        formatted_data.append({
            "prompt": item.get("prompt", ""),
            "chosen": item.get("response_winner", ""),
            "rejected": item.get("response_loser", "")
        })
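
    # Hedged addition, not in the original: drop rows where any field fell
    # back to "" above, since empty prompts or responses carry no preference
    # signal for DPO.
    formatted_data = [d for d in formatted_data
                      if d["prompt"] and d["chosen"] and d["rejected"]]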

    print(f"Formatted data: {len(formatted_data)} items")
    return Dataset.from_dict({
        "prompt": [item["prompt"] for item in formatted_data],
        "chosen": [item["chosen"] for item in formatted_data],
        "rejected": [item["rejected"] for item in formatted_data]
    })


def train_dpo(model, tokenizer, dataset):
    print("Configuring training arguments...")

    training_args = DPOConfig(
        output_dir="./dpo_trained_model_1216",
        overwrite_output_dir=True,
        per_device_train_batch_size=8,
        gradient_accumulation_steps=128,
        per_device_eval_batch_size=8,
        learning_rate=1e-5,
        weight_decay=0.01,
        num_train_epochs=1,
        lr_scheduler_type="constant_with_warmup",
        warmup_steps=10,
        fp16=True,
        eval_strategy="steps",
        save_strategy="steps",
        save_steps=32,  # must be a multiple of eval_steps for load_best_model_at_end
        logging_steps=8,
        eval_steps=8,
        load_best_model_at_end=True,
        save_safetensors=False,
        save_only_model=True,
        remove_unused_columns=False,  # keep the raw prompt/chosen/rejected columns
    )
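    # Note (added): the settings above give an effective batch of
    # per_device_train_batch_size (8) x gradient_accumulation_steps (128)
    # = 1024 preference pairs per optimizer step.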
| | print("Training arguments configured.") |
| |
|
| | print("Initializing DPOTrainer...") |
    dpo_trainer = DPOTrainer(
        model=model,
        args=training_args,
        beta=0.3,
        train_dataset=dataset,
        eval_dataset=dataset,  # evaluates on the training split; use a held-out split if available
        tokenizer=tokenizer,
        max_prompt_length=162,
        max_length=512,
        loss_type="sigmoid",
        label_smoothing=0.0,
    )
    print("DPOTrainer initialized.")

| | print("Starting training...") |
| |
|
| | original_forward = model.forward |
| |
|
| | def new_forward(*args, **kwargs): |
| | if "input_ids" in kwargs: |
| | kwargs["input_ids"] = kwargs["input_ids"].long() |
| | return original_forward(*args, **kwargs) |
| |
|
| | model.forward = new_forward |
| |
|
    dpo_trainer.train()
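
    # Hedged addition, not in the original: explicitly write the final (best,
    # per load_best_model_at_end) weights to training_args.output_dir.
    # save_model() is the standard transformers Trainer API that DPOTrainer
    # inherits.
    dpo_trainer.save_model()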
| | print("Training completed.") |
if __name__ == "__main__":
    print("Loading model...")
    model, tokenizer = load_model()

    print("Loading dataset...")
    dataset = load_dataset()

    print("Starting DPO training...")
    train_dpo(model, tokenizer, dataset)

    print("Training complete. Model saved.")