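"""Fine-tune Qwen2.5-0.5B-Instruct on the open-r1/codeforces-cots dataset with LoRA.

On CUDA the base model is loaded in 4-bit (QLoRA) via bitsandbytes; on CPU it
falls back to full-precision training. Requires torch, datasets, transformers,
and peft, plus bitsandbytes for the GPU path.
"""
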
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
from peft import LoraConfig, get_peft_model

def main():
    # Configuration
    model_name = "Qwen/Qwen2.5-0.5B-Instruct"  # Using 0.5B as 0.6B doesn't exist
    output_dir = "./qwen-codeforces-cots"
    max_seq_length = 1024  # Reduced from 2048 to save memory

    # Detect device - prefer CUDA for GPU training
    if torch.cuda.is_available():
        device = "cuda"
        use_fp16 = True
        print(f"Using device: CUDA ({torch.cuda.get_device_name(0)})")
    else:
        device = "cpu"
        use_fp16 = False
        print(f"Using device: CPU (training will be slow)")

    print("Loading dataset...")
    dataset = load_dataset("open-r1/codeforces-cots", split="train")

    # Split into train and eval
    dataset = dataset.train_test_split(test_size=0.05, seed=42)
    train_dataset = dataset["train"]
    eval_dataset = dataset["test"]

    print(f"Train samples: {len(train_dataset)}")
    print(f"Eval samples: {len(eval_dataset)}")

    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        trust_remote_code=True,
    )
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    print("Loading model...")
    # Use appropriate dtype and device_map based on hardware
    if torch.cuda.is_available():
        from transformers import BitsAndBytesConfig
        # Use 4-bit quantization for efficient GPU training
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
        )
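        # This mirrors the standard QLoRA recipe: NF4 weights, double
        # quantization, and fp16 compute for the dequantized matmuls.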
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=bnb_config,
            device_map="auto",
            trust_remote_code=True,
        )
        from peft import prepare_model_for_kbit_training
        model = prepare_model_for_kbit_training(model)
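        # prepare_model_for_kbit_training freezes the quantized base weights,
        # upcasts layer norms to fp32 for stability, and (by default) enables
        # input gradients so the LoRA adapters can backpropagate through the
        # frozen base.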
        # Enable gradient checkpointing for memory efficiency
        model.gradient_checkpointing_enable()
    else:
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float32,
            trust_remote_code=True,
        )
        model.gradient_checkpointing_enable()
        # Without this, checkpointed activations may not require grad when the
        # base model is frozen under LoRA (the kbit path handles it internally).
        model.enable_input_require_grads()

    # LoRA config - reduced rank for memory efficiency
    lora_config = LoraConfig(
        r=8,  # Reduced from 16 to save memory
        lora_alpha=16,  # Reduced proportionally
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )
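    # target_modules above covers every attention and MLP projection in a Qwen2
    # block, so adapters are attached to all transformer linear layers.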

    # Apply LoRA
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    # Format and tokenize dataset
    def format_and_tokenize(example):
        # Format the chat messages
        text = tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
            add_generation_prompt=False
        )
        # Tokenize
        tokenized = tokenizer(
            text,
            truncation=True,
            max_length=max_seq_length,
            padding=False,
            return_tensors=None,
        )
        # Labels are not added here: DataCollatorForLanguageModeling(mlm=False)
        # derives them from the padded input_ids at collation time, so copying
        # input_ids here would be redundant (and ragged across examples).
        return tokenized

    print("Formatting and tokenizing dataset...")
    train_dataset = train_dataset.map(
        format_and_tokenize,
        remove_columns=train_dataset.column_names,
        desc="Formatting train dataset"
    )
    eval_dataset = eval_dataset.map(
        format_and_tokenize,
        remove_columns=eval_dataset.column_names,
        desc="Formatting eval dataset"
    )

    # Data collator for padding
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,  # We're doing causal LM, not masked LM
    )
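    # With mlm=False the collator pads each batch dynamically, copies the padded
    # input_ids into labels, and masks pad positions with -100 so they are
    # ignored by the loss. Because pad_token is set to eos_token above, EOS
    # positions are masked as well.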

    # Training arguments - optimized for T4 GPU
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=1,  # Keep at 1 for memory safety
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=8,  # Reduced from 16 to lower memory pressure
        num_train_epochs=1,
        max_steps=1000,  # Limit steps for testing
        learning_rate=2e-4,
        fp16=use_fp16,
        gradient_checkpointing=True,  # Enable gradient checkpointing to save memory
        save_strategy="steps",
        save_steps=200,  # Save more frequently
        eval_strategy="steps",
        eval_steps=200,
        logging_steps=10,
        warmup_steps=50,
        lr_scheduler_type="cosine",
        optim="paged_adamw_8bit" if torch.cuda.is_available() else "adamw_torch",  # Use 8-bit optimizer on GPU
        report_to="none",
        max_grad_norm=0.3,
        save_total_limit=2,
        load_best_model_at_end=False,  # Disable to avoid loading issues
        dataloader_num_workers=0,  # No multiprocessing for stability
    )
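    # Effective batch size: per_device_train_batch_size * gradient_accumulation_steps = 8.
    # max_steps takes precedence over num_train_epochs, so training stops after
    # 1000 optimizer steps regardless of dataset size.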

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
    )

    print("Starting training...")
    trainer.train()

    print("Saving model...")
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    print("Training complete!")
    print(f"Model saved to: {output_dir}")

if __name__ == "__main__":
    main()
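
# A minimal sketch for loading the saved adapter later (how you run inference
# afterwards is up to you; this is not part of the training script itself):
#
#   from peft import AutoPeftModelForCausalLM
#   from transformers import AutoTokenizer
#   model = AutoPeftModelForCausalLM.from_pretrained("./qwen-codeforces-cots")
#   tokenizer = AutoTokenizer.from_pretrained("./qwen-codeforces-cots")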