import os

import torch
import torch._dynamo
import wandb
from jinja2 import Environment, FileSystemLoader
from peft import LoraConfig, PeftModelForSequenceClassification
from torch.nn import MSELoss
from torch.utils.data import random_split
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

from sotopia_rl.data import RMDataset

torch._dynamo.config.suppress_errors = True

os.environ["NCCL_P2P_DISABLE"] = "1"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"


class SotopiaRMTrainer(Trainer):
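    """Hugging Face Trainer specialized for Sotopia reward-model training.

    Wraps a sequence-classification backbone (num_labels=1) in LoRA adapters
    and regresses its scalar output onto gold reward labels with MSE.
    """
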
    def __init__(self, args, accelerator, **kwargs):
        # NOTE: `self.args` holds the CLI namespace only until super().__init__
        # runs, after which Trainer rebinds it to the TrainingArguments below.
        self.args = args
        self.accelerator = accelerator
        self.device = accelerator.device

        tokenizer = AutoTokenizer.from_pretrained(args.model_name)
        # Some tokenizers (e.g. Llama's) ship without a pad token; fall back
        # to EOS so padded batching works.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        train_dataset, eval_dataset = self.setup_dataset(tokenizer)

        # Initialize wandb only on the main process
        if self.accelerator.is_main_process:
            wandb.init(
                project=args.wandb_project,
                name=args.wandb_run_name,
                config={
                    k: v
                    for k, v in vars(args).items()
                    if isinstance(v, (int, float, str))
                },
            )

        peft_config = LoraConfig(
            r=args.lora_r,
            lora_alpha=args.lora_alpha,
            lora_dropout=args.lora_dropout,
            target_modules=args.target_modules.split(","),
        )

        base_model = AutoModelForSequenceClassification.from_pretrained(
            args.model_name,
            num_labels=1,
            torch_dtype="auto",
        )

        # Wrap the base model in LoRA adapters; only the adapter (and score
        # head) parameters remain trainable.
        self.model = PeftModelForSequenceClassification(base_model, peft_config)
        self.model.config.pad_token_id = tokenizer.pad_token_id

        # Set up the TrainingArguments for the HF Trainer
        training_args = TrainingArguments(
            output_dir=args.checkpoint_dir,
            per_device_train_batch_size=args.train_batch_size,
            per_device_eval_batch_size=args.val_batch_size,
            num_train_epochs=args.num_epochs,
            logging_steps=1,
            save_steps=args.evaluation_steps,
            save_strategy="steps",
            logging_dir="./logs",
            gradient_accumulation_steps=args.accumulation_steps,
            learning_rate=args.learning_rate,
            # Warm up for `warmup_epochs` epochs' worth of optimizer steps
            # (dataset size over the effective per-device batch size).
            warmup_steps=int(
                len(train_dataset)
                / (args.train_batch_size * args.accumulation_steps)
                * args.warmup_epochs
            ),
            optim="adamw_torch",
            remove_unused_columns=False,
            dataloader_num_workers=4,
            report_to="wandb",
            ddp_find_unused_parameters=False,
            eval_strategy="steps",
            # Without an explicit eval_steps, HF falls back to logging_steps
            # (here 1, i.e. evaluating every step); align it with save_steps.
            eval_steps=args.evaluation_steps,
            label_names=["labels"],
        )

        # random_split returns Subsets, so reach through `.dataset` for the
        # underlying RMDataset's collate_fn.
        collate_fn = (
            train_dataset.dataset.collate_fn
            if hasattr(train_dataset, "dataset")
            else None
        )

        super().__init__(
            model=self.model,
            args=training_args,
            processing_class=tokenizer,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            data_collator=collate_fn,
            **kwargs,
        )
        self.loss_fn = MSELoss()  # regress predicted rewards onto gold rewards

    def setup_dataset(self, tokenizer):
        # Load the Jinja2 prompt template from the directory that contains it.
        template_dir, template_file = os.path.split(self.args.template_path)
        env = Environment(loader=FileSystemLoader(template_dir))
        template = env.get_template(template_file)
        dataset = RMDataset(
            self.args.reward_data_path, tokenizer, template, self.args.max_length
        )

        if self.accelerator.is_main_process:
            print(f"dataset: {len(dataset)}")

        train_size = int(len(dataset) * 0.95)
        val_size = len(dataset) - train_size

        # Use deterministic splitter with seed to ensure same split across processes
        generator = torch.Generator().manual_seed(42)
        train_dataset, val_dataset = random_split(
            dataset, [train_size, val_size], generator=generator
        )
        return train_dataset, val_dataset

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
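        """Override the default Trainer loss with MSE between the scalar
        reward predicted by the classification head and the gold reward."""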
        input_ids = inputs["input_ids"]
        attention_masks = inputs["attention_mask"]
        true_rewards = inputs["labels"]

        outputs = model(input_ids, attention_mask=attention_masks)
        # The single-label classification head emits one scalar per sequence.
        predicted_rewards = outputs.logits.squeeze(-1)  # Shape: (batch_size,)
        # Cast labels to the logits dtype so MSE works under mixed precision.
        loss = self.loss_fn(predicted_rewards, true_rewards.to(predicted_rewards.dtype))

        return (loss, outputs) if return_outputs else loss
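

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module): a minimal CLI
# entry point, assuming flag names that match the attributes this trainer
# reads from `args`. The default values below are hypothetical placeholders.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import argparse

    from accelerate import Accelerator

    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", required=True)
    parser.add_argument("--reward_data_path", required=True)
    parser.add_argument("--template_path", required=True)
    parser.add_argument("--checkpoint_dir", default="./rm_checkpoints")
    parser.add_argument("--max_length", type=int, default=2048)
    parser.add_argument("--train_batch_size", type=int, default=4)
    parser.add_argument("--val_batch_size", type=int, default=4)
    parser.add_argument("--num_epochs", type=int, default=1)
    parser.add_argument("--accumulation_steps", type=int, default=1)
    parser.add_argument("--learning_rate", type=float, default=1e-5)
    parser.add_argument("--warmup_epochs", type=float, default=0.1)
    parser.add_argument("--evaluation_steps", type=int, default=100)
    parser.add_argument("--lora_r", type=int, default=8)
    parser.add_argument("--lora_alpha", type=int, default=16)
    parser.add_argument("--lora_dropout", type=float, default=0.05)
    parser.add_argument("--target_modules", default="q_proj,v_proj")
    parser.add_argument("--wandb_project", default="sotopia-rm")
    parser.add_argument("--wandb_run_name", default=None)
    cli_args = parser.parse_args()

    trainer = SotopiaRMTrainer(cli_args, Accelerator())
    trainer.train()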