likhonsheikh commited on
Commit
14b6e56
·
verified ·
1 Parent(s): 5139cac

Add train.py

Browse files
Files changed (1) hide show
  1. train.py +171 -0
train.py ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Sheikh-2.5-Coder Training Script
4
+ ================================
5
+
6
+ This script handles the training pipeline for Sheikh-2.5-Coder model.
7
+ """
8
+
9
+ import os
10
+ import torch
11
+ import argparse
12
+ from typing import Optional
13
+ from transformers import (
14
+ AutoTokenizer,
15
+ AutoModelForCausalLM,
16
+ TrainingArguments,
17
+ Trainer,
18
+ DataCollatorForSeq2Seq,
19
+ get_linear_schedule_with_warmup,
20
+ )
21
+ from datasets import load_dataset, Dataset
22
+ import yaml
23
+ from model import SheikhModel, SheikhConfig, setup_training_args
24
+
25
+ def load_config(config_path: str) -> dict:
26
+ """Load training configuration from YAML file."""
27
+ with open(config_path, 'r') as f:
28
+ return yaml.safe_load(f)
29
+
30
+ def prepare_training_data(data_config: dict) -> Dataset:
31
+ """Prepare training dataset."""
32
+ # This would be implemented based on your specific data sources
33
+ # For now, return a placeholder
34
+ print("Loading training data...")
35
+
36
+ # Example data preparation logic would go here
37
+ # This might involve loading from Hugging Face datasets
38
+ # or custom data sources
39
+
40
+ # Placeholder: return empty dataset for now
41
+ train_dataset = Dataset.from_dict({
42
+ 'input_ids': [[1, 2, 3, 4, 5]],
43
+ 'attention_mask': [[1, 1, 1, 1, 1]],
44
+ 'labels': [[2, 3, 4, 5, 6]]
45
+ })
46
+
47
+ return train_dataset
48
+
49
+ def setup_model_and_tokenizer(config: dict) -> tuple:
50
+ """Setup model and tokenizer."""
51
+ print("Initializing model and tokenizer...")
52
+
53
+ # Load tokenizer (would be from a base model for continued training)
54
+ tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B")
55
+
56
+ # Create model configuration
57
+ model_config = SheikhConfig(
58
+ vocab_size=tokenizer.vocab_size,
59
+ hidden_size=config['model']['hidden_size'],
60
+ num_attention_heads=config['model']['num_attention_heads'],
61
+ num_key_value_heads=config['model']['num_key_value_heads'],
62
+ num_hidden_layers=config['model']['num_hidden_layers'],
63
+ intermediate_size=config['model']['intermediate_size'],
64
+ max_position_embeddings=config['model']['context_length'],
65
+ )
66
+
67
+ # Initialize model
68
+ model = SheikhModel(model_config)
69
+
70
+ # Resize token embeddings if needed
71
+ model.resize_token_embeddings(len(tokenizer))
72
+
73
+ # Ensure tokenizer has proper padding token
74
+ if tokenizer.pad_token is None:
75
+ tokenizer.pad_token = tokenizer.eos_token
76
+
77
+ return model, tokenizer
78
+
79
+ def train_model(
80
+ model,
81
+ tokenizer,
82
+ train_dataset,
83
+ eval_dataset,
84
+ config: dict,
85
+ output_dir: str,
86
+ ):
87
+ """Train the Sheikh-2.5-Coder model."""
88
+
89
+ # Setup training arguments
90
+ training_config = config['training']
91
+ args = setup_training_args(
92
+ output_dir=output_dir,
93
+ learning_rate=training_config['learning_rate']
94
+ )
95
+
96
+ # Data collator
97
+ data_collator = DataCollatorForSeq2Seq(
98
+ tokenizer=tokenizer,
99
+ model=model,
100
+ padding=True,
101
+ return_tensors="pt",
102
+ )
103
+
104
+ # Initialize trainer
105
+ trainer = Trainer(
106
+ model=model,
107
+ args=args,
108
+ train_dataset=train_dataset,
109
+ eval_dataset=eval_dataset,
110
+ data_collator=data_collator,
111
+ tokenizer=tokenizer,
112
+ )
113
+
114
+ # Start training
115
+ print("Starting training...")
116
+ trainer.train()
117
+
118
+ # Save final model
119
+ trainer.save_model(output_dir)
120
+ tokenizer.save_pretrained(output_dir)
121
+
122
+ print(f"Training completed! Model saved to {output_dir}")
123
+
124
+ def main():
125
+ parser = argparse.ArgumentParser(description="Train Sheikh-2.5-Coder model")
126
+ parser.add_argument(
127
+ "--config",
128
+ type=str,
129
+ default="training_config.yaml",
130
+ help="Path to training configuration file"
131
+ )
132
+ parser.add_argument(
133
+ "--output-dir",
134
+ type=str,
135
+ default="./sheikh-2.5-coder-output",
136
+ help="Directory to save the trained model"
137
+ )
138
+ parser.add_argument(
139
+ "--resume-from-checkpoint",
140
+ type=str,
141
+ default=None,
142
+ help="Path to checkpoint to resume from"
143
+ )
144
+
145
+ args = parser.parse_args()
146
+
147
+ # Load configuration
148
+ config = load_config(args.config)
149
+
150
+ # Setup model and tokenizer
151
+ model, tokenizer = setup_model_and_tokenizer(config)
152
+
153
+ # Prepare training data
154
+ train_dataset = prepare_training_data(config['data'])
155
+ eval_dataset = prepare_training_data(config['data']) # Placeholder
156
+
157
+ # Create output directory
158
+ os.makedirs(args.output_dir, exist_ok=True)
159
+
160
+ # Train model
161
+ train_model(
162
+ model=model,
163
+ tokenizer=tokenizer,
164
+ train_dataset=train_dataset,
165
+ eval_dataset=eval_dataset,
166
+ config=config,
167
+ output_dir=args.output_dir
168
+ )
169
+
170
+ if __name__ == "__main__":
171
+ main()