#!/usr/bin/env python3
"""
Sheikh-2.5-Coder Training Script
================================

This script handles the training pipeline for Sheikh-2.5-Coder model.
"""

import os
import argparse
from typing import Optional

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    DataCollatorForSeq2Seq,
)
from datasets import load_dataset, Dataset
import yaml

from model import SheikhModel, SheikhConfig, setup_training_args

def load_config(config_path: str) -> dict:
    """Load training configuration from YAML file."""
    with open(config_path, 'r') as f:
        return yaml.safe_load(f)
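
# For reference, the keys this script reads imply a config shaped roughly like
# the block below. The values are illustrative placeholders (loosely following
# Qwen2.5-3B), not the project's published hyperparameters; the `data` keys are
# assumptions used by the Hub-loading sketch further down.
#
#   model:
#     hidden_size: 2048
#     num_attention_heads: 16
#     num_key_value_heads: 2
#     num_hidden_layers: 36
#     intermediate_size: 11008
#     context_length: 32768
#   training:
#     learning_rate: 2.0e-5
#   data:
#     dataset_name: bigcode/the-stack-smol
#     text_column: content
#     max_length: 2048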

def prepare_training_data(data_config: dict) -> Dataset:
    """Prepare the training dataset.

    This is a stub: real data loading (from Hugging Face datasets or custom
    sources) is not wired up yet. See prepare_training_data_from_hub below
    for a sketch of one possible implementation.
    """
    print("Loading training data...")

    # Placeholder: a single dummy example so the pipeline runs end to end
    train_dataset = Dataset.from_dict({
        'input_ids': [[1, 2, 3, 4, 5]],
        'attention_mask': [[1, 1, 1, 1, 1]],
        'labels': [[2, 3, 4, 5, 6]]
    })

    return train_dataset
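
# A minimal sketch of a real implementation, assuming the data config names a
# Hugging Face dataset and a text column. The dataset name, column, and
# max_length defaults below are hypothetical placeholders, not the project's
# actual data source.
def prepare_training_data_from_hub(data_config: dict, tokenizer) -> Dataset:
    """Load and tokenize a causal-LM dataset from the Hugging Face Hub."""
    raw = load_dataset(
        data_config.get("dataset_name", "bigcode/the-stack-smol"),  # hypothetical
        split="train",
    )
    max_length = data_config.get("max_length", 2048)

    def tokenize(batch):
        enc = tokenizer(
            batch[data_config.get("text_column", "content")],
            truncation=True,
            max_length=max_length,
        )
        # For causal LM training, labels mirror input_ids; DataCollatorForSeq2Seq
        # pads them to the batch length at collation time.
        enc["labels"] = [ids.copy() for ids in enc["input_ids"]]
        return enc

    return raw.map(tokenize, batched=True, remove_columns=raw.column_names)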

def setup_model_and_tokenizer(config: dict) -> tuple:
    """Setup model and tokenizer."""
    print("Initializing model and tokenizer...")
    
    # Load tokenizer (would be from a base model for continued training)
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B")
    
    # Create model configuration
    model_config = SheikhConfig(
        vocab_size=tokenizer.vocab_size,
        hidden_size=config['model']['hidden_size'],
        num_attention_heads=config['model']['num_attention_heads'],
        num_key_value_heads=config['model']['num_key_value_heads'],
        num_hidden_layers=config['model']['num_hidden_layers'],
        intermediate_size=config['model']['intermediate_size'],
        max_position_embeddings=config['model']['context_length'],
    )
    
    # Initialize model
    model = SheikhModel(model_config)
    
    # Resize token embeddings if needed
    model.resize_token_embeddings(len(tokenizer))
    
    # Ensure tokenizer has proper padding token
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    
    return model, tokenizer
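
# The tokenizer above comes from Qwen/Qwen2.5-3B; for continued pretraining one
# might also warm-start the model weights from that checkpoint. A sketch under
# the assumption that SheikhModel's parameter names line up with the base
# architecture; mismatched keys are skipped via strict=False.
def load_base_weights(model, base_name: str = "Qwen/Qwen2.5-3B"):
    """Optionally initialize from a pretrained base checkpoint."""
    base = AutoModelForCausalLM.from_pretrained(base_name)
    missing, unexpected = model.load_state_dict(base.state_dict(), strict=False)
    if missing or unexpected:
        print(f"Warm start: {len(missing)} missing / {len(unexpected)} unexpected keys")
    return model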

def train_model(
    model,
    tokenizer,
    train_dataset,
    eval_dataset,
    config: dict,
    output_dir: str,
    resume_from_checkpoint: Optional[str] = None,
):
    """Train the Sheikh-2.5-Coder model."""
    
    # Setup training arguments
    training_config = config['training']
    args = setup_training_args(
        output_dir=output_dir,
        learning_rate=training_config['learning_rate']
    )
    
    # Data collator (dynamically pads input_ids, attention_mask, and labels per batch)
    data_collator = DataCollatorForSeq2Seq(
        tokenizer=tokenizer,
        model=model,
        padding=True,
        return_tensors="pt",
    )
    
    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )
    
    # Start training (optionally resuming from a saved checkpoint)
    print("Starting training...")
    trainer.train(resume_from_checkpoint=resume_from_checkpoint)
    
    # Save final model
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)
    
    print(f"Training completed! Model saved to {output_dir}")

def main():
    parser = argparse.ArgumentParser(description="Train Sheikh-2.5-Coder model")
    parser.add_argument(
        "--config",
        type=str,
        default="training_config.yaml",
        help="Path to training configuration file"
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="./sheikh-2.5-coder-output",
        help="Directory to save the trained model"
    )
    parser.add_argument(
        "--resume-from-checkpoint",
        type=str,
        default=None,
        help="Path to checkpoint to resume from"
    )
    
    args = parser.parse_args()
    
    # Load configuration
    config = load_config(args.config)
    
    # Setup model and tokenizer
    model, tokenizer = setup_model_and_tokenizer(config)
    
    # Prepare training data (the eval set is a placeholder until a real split exists)
    train_dataset = prepare_training_data(config['data'])
    eval_dataset = prepare_training_data(config['data'])
    
    # Create output directory
    os.makedirs(args.output_dir, exist_ok=True)
    
    # Train model
    train_model(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        config=config,
        output_dir=args.output_dir,
        resume_from_checkpoint=args.resume_from_checkpoint,
    )

if __name__ == "__main__":
    main()