#!/usr/bin/env python3
"""
Sheikh-2.5-Coder Training Script
================================
This script handles the training pipeline for the Sheikh-2.5-Coder model.
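
Usage (the script filename below is illustrative; both flags show their
defaults and are optional):

    python train.py --config training_config.yaml \
        --output-dir ./sheikh-2.5-coder-output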
"""

import argparse
import os
from typing import Optional

import yaml
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Trainer,
)

from model import SheikhModel, SheikhConfig, setup_training_args


def load_config(config_path: str) -> dict:
    """Load training configuration from YAML file."""
    with open(config_path, "r") as f:
        return yaml.safe_load(f)
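
# The config file is expected to look roughly like the sketch below. The keys
# are inferred from how the config is read later in this script; the values
# are illustrative placeholders, not the project's actual hyperparameters:
#
#   model:
#     hidden_size: 2048
#     num_attention_heads: 16
#     num_key_value_heads: 2
#     num_hidden_layers: 36
#     intermediate_size: 11008
#     context_length: 32768
#   training:
#     learning_rate: 2.0e-5
#   data: {}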


def prepare_training_data(data_config: dict) -> Dataset:
    """Prepare the training dataset.

    This is a placeholder; a real implementation would load and tokenize
    data from the sources described in ``data_config``, e.g. Hugging Face
    datasets or custom corpora.
    """
    print("Loading training data...")
    # Placeholder: a single toy example so the pipeline runs end to end.
    train_dataset = Dataset.from_dict({
        "input_ids": [[1, 2, 3, 4, 5]],
        "attention_mask": [[1, 1, 1, 1, 1]],
        "labels": [[2, 3, 4, 5, 6]],
    })
    return train_dataset
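
# A minimal sketch of what a real prepare_training_data might look like,
# assuming a Hugging Face dataset with a "content" text column. The dataset
# key, column name, and max length are assumptions for illustration, not the
# project's actual data sources:
#
#   from datasets import load_dataset
#
#   def prepare_training_data(data_config, tokenizer):
#       raw = load_dataset(data_config["dataset_name"], split="train")
#
#       def tokenize(batch):
#           enc = tokenizer(batch["content"], truncation=True, max_length=2048)
#           enc["labels"] = enc["input_ids"].copy()
#           return enc
#
#       return raw.map(tokenize, batched=True, remove_columns=raw.column_names)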


def setup_model_and_tokenizer(config: dict) -> tuple:
    """Set up the model and tokenizer."""
    print("Initializing model and tokenizer...")
    # Load the tokenizer from the base model used for continued training.
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B")
    # Ensure the tokenizer has a padding token before it is used for collation.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    # Create the model configuration from the YAML settings.
    model_config = SheikhConfig(
        vocab_size=tokenizer.vocab_size,
        hidden_size=config["model"]["hidden_size"],
        num_attention_heads=config["model"]["num_attention_heads"],
        num_key_value_heads=config["model"]["num_key_value_heads"],
        num_hidden_layers=config["model"]["num_hidden_layers"],
        intermediate_size=config["model"]["intermediate_size"],
        max_position_embeddings=config["model"]["context_length"],
    )
    # Initialize the model and resize embeddings to match the tokenizer
    # (len(tokenizer) includes any added special tokens).
    model = SheikhModel(model_config)
    model.resize_token_embeddings(len(tokenizer))
    return model, tokenizer


def train_model(
    model,
    tokenizer,
    train_dataset,
    eval_dataset,
    config: dict,
    output_dir: str,
    resume_from_checkpoint: Optional[str] = None,
):
    """Train the Sheikh-2.5-Coder model."""
    # Build the training arguments from the YAML config.
    training_config = config["training"]
    args = setup_training_args(
        output_dir=output_dir,
        learning_rate=training_config["learning_rate"],
    )
    # Data collator that pads inputs and labels dynamically per batch.
    data_collator = DataCollatorForSeq2Seq(
        tokenizer=tokenizer,
        model=model,
        padding=True,
        return_tensors="pt",
    )
    # Initialize the trainer.
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )
    # Start training, resuming from a checkpoint if one was given.
    print("Starting training...")
    trainer.train(resume_from_checkpoint=resume_from_checkpoint)
    # Save the final model and tokenizer.
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"Training completed! Model saved to {output_dir}")


def main():
    parser = argparse.ArgumentParser(description="Train the Sheikh-2.5-Coder model")
    parser.add_argument(
        "--config",
        type=str,
        default="training_config.yaml",
        help="Path to the training configuration file",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="./sheikh-2.5-coder-output",
        help="Directory to save the trained model",
    )
    parser.add_argument(
        "--resume-from-checkpoint",
        type=str,
        default=None,
        help="Path to a checkpoint to resume from",
    )
    args = parser.parse_args()
    # Load configuration.
    config = load_config(args.config)
    # Set up model and tokenizer.
    model, tokenizer = setup_model_and_tokenizer(config)
    # Prepare training data (the eval split is a placeholder for now).
    train_dataset = prepare_training_data(config["data"])
    eval_dataset = prepare_training_data(config["data"])  # Placeholder
    # Create the output directory.
    os.makedirs(args.output_dir, exist_ok=True)
    # Train the model.
    train_model(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        config=config,
        output_dir=args.output_dir,
        resume_from_checkpoint=args.resume_from_checkpoint,
    )


if __name__ == "__main__":
    main()