mgbam committed on
Commit
500ebee
·
verified ·
1 Parent(s): d839150

Upload train_coder.py with huggingface_hub
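
For reference, an upload like this can be scripted with huggingface_hub. A minimal sketch (the target repo id and the token source are assumptions, not confirmed by this commit page):

    from huggingface_hub import HfApi

    api = HfApi()  # picks up HF_TOKEN or a cached `huggingface-cli login`
    api.upload_file(
        path_or_fileobj="train_coder.py",
        path_in_repo="train_coder.py",
        repo_id="mgbam/qwen-codeforces-coder",  # assumed target repo
        commit_message="Upload train_coder.py with huggingface_hub",
    )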

Files changed (1)
  1. train_coder.py +158 -0
train_coder.py ADDED
@@ -0,0 +1,158 @@
"""
Fine-tune Qwen2.5-0.5B to solve competitive programming problems
with chain-of-thought reasoning using the codeforces-cots dataset.
"""

import os
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
import torch

# Configuration
MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
DATASET_NAME = "open-r1/codeforces-cots"
OUTPUT_DIR = "./qwen-codeforces-coder"
HF_REPO = "mgbam/qwen-codeforces-coder"

print(f"🚀 Starting fine-tuning: {MODEL_NAME}")
print(f"📊 Dataset: {DATASET_NAME}")
print(f"💾 Output: {HF_REPO}")
print()

# Load tokenizer and model
print("Loading tokenizer and model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)

# Add padding token if not present
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.eos_token_id

# Load and prepare dataset
print(f"Loading dataset: {DATASET_NAME}...")
dataset = load_dataset(DATASET_NAME, split="train")

# Take a subset for faster training (you can increase this)
dataset = dataset.select(range(min(1000, len(dataset))))
print(f"Training on {len(dataset)} examples")

# Split into train/eval
dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

def format_prompt(example):
    """Format the dataset into instruction-following format."""
    # The dataset has 'problem' and 'solution' fields
    problem = example.get('problem', example.get('text', ''))
    solution = example.get('solution', example.get('output', ''))

    # Create instruction format
    prompt = f"""<|im_start|>system
You are a competitive programming expert. Solve problems with clear chain-of-thought reasoning.<|im_end|>
<|im_start|>user
{problem}<|im_end|>
<|im_start|>assistant
{solution}<|im_end|>"""

    return {"text": prompt}
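
# Note: the hard-coded <|im_start|>/<|im_end|> markers mirror Qwen2.5's ChatML
# chat template. A template-driven equivalent (a sketch, assuming the tokenizer
# ships a chat template, as Qwen2.5-Instruct does) would be:
#
#   messages = [
#       {"role": "system", "content": "You are a competitive programming expert..."},
#       {"role": "user", "content": problem},
#       {"role": "assistant", "content": solution},
#   ]
#   text = tokenizer.apply_chat_template(messages, tokenize=False)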

# Format datasets
print("Formatting dataset...")
train_dataset = train_dataset.map(format_prompt, remove_columns=train_dataset.column_names)
eval_dataset = eval_dataset.map(format_prompt, remove_columns=eval_dataset.column_names)

# Tokenize
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=2048,
        padding="max_length",
    )

print("Tokenizing...")
train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
eval_dataset = eval_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Set format for PyTorch
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])
eval_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

# Training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-5,
    warmup_steps=100,
    logging_steps=10,
    eval_steps=50,
    save_steps=100,
    eval_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    fp16=False,
    bf16=True,
    push_to_hub=True,
    hub_model_id=HF_REPO,
    hub_strategy="every_save",
    report_to=["tensorboard"],
    logging_first_step=True,
)
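
# With per_device_train_batch_size=4 and gradient_accumulation_steps=4, the
# effective batch size is 4 * 4 = 16 sequences per device per optimizer step.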

# Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)
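
# With mlm=False the collator performs causal-LM collation: labels are a copy
# of input_ids with padding positions set to -100 so the loss ignores them.
# Caveat: because pad_token was set to eos_token above, EOS positions are
# masked out of the loss as well.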

# Initialize trainer
print("Initializing trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

# Train!
print("\n" + "="*50)
print("🔥 Starting training!")
print("="*50 + "\n")

trainer.train()

# Save final model
print("\n" + "="*50)
print("💾 Saving final model...")
print("="*50 + "\n")

trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

# Push to hub
print(f"📤 Pushing to Hub: {HF_REPO}")
trainer.push_to_hub()

print("\n" + "="*50)
print("✅ Training complete!")
print(f"🎯 Model available at: https://huggingface.co/{HF_REPO}")
print("="*50)
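
Once the push succeeds, the checkpoint can be exercised like any causal LM. A minimal inference sketch (the problem text is a placeholder and the generation settings are illustrative assumptions, not values from the training script):

    import torch
    from transformers import AutoTokenizer, AutoModelForCausalLM

    repo = "mgbam/qwen-codeforces-coder"
    tokenizer = AutoTokenizer.from_pretrained(repo)
    model = AutoModelForCausalLM.from_pretrained(
        repo, torch_dtype=torch.bfloat16, device_map="auto"
    )

    messages = [
        {"role": "system", "content": "You are a competitive programming expert. "
                                      "Solve problems with clear chain-of-thought reasoning."},
        {"role": "user", "content": "Given an array of n integers, ..."},  # placeholder problem
    ]
    input_ids = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)
    output = model.generate(input_ids, max_new_tokens=1024, do_sample=False)
    print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))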