#!/usr/bin/env python3
"""
Sheikh-2.5-Coder Training Script
================================

This script handles the training pipeline for the Sheikh-2.5-Coder model.
"""

import os
import argparse
from typing import Optional

from transformers import (
    AutoTokenizer,
    Trainer,
    DataCollatorForSeq2Seq,
)
from datasets import load_dataset, Dataset
import yaml

from model import SheikhModel, SheikhConfig, setup_training_args


def load_config(config_path: str) -> dict:
    """Load training configuration from a YAML file."""
    with open(config_path, 'r') as f:
        return yaml.safe_load(f)


def prepare_training_data(data_config: dict) -> Dataset:
    """Prepare the training dataset."""
    # This would be implemented based on your specific data sources.
    # For now, return a placeholder.
    print("Loading training data...")

    # Real data preparation logic would go here. It might involve loading
    # from Hugging Face datasets or custom data sources; see the sketch
    # after train_model below.

    # Placeholder: return a tiny dummy dataset for now.
    train_dataset = Dataset.from_dict({
        'input_ids': [[1, 2, 3, 4, 5]],
        'attention_mask': [[1, 1, 1, 1, 1]],
        'labels': [[2, 3, 4, 5, 6]]
    })

    return train_dataset


def setup_model_and_tokenizer(config: dict) -> tuple:
    """Set up the model and tokenizer."""
    print("Initializing model and tokenizer...")

    # Load the tokenizer (from a base model, for continued training).
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B")

    # Ensure the tokenizer has a padding token before batching.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Create the model configuration.
    model_config = SheikhConfig(
        vocab_size=tokenizer.vocab_size,
        hidden_size=config['model']['hidden_size'],
        num_attention_heads=config['model']['num_attention_heads'],
        num_key_value_heads=config['model']['num_key_value_heads'],
        num_hidden_layers=config['model']['num_hidden_layers'],
        intermediate_size=config['model']['intermediate_size'],
        max_position_embeddings=config['model']['context_length'],
    )

    # Initialize the model.
    model = SheikhModel(model_config)

    # Resize token embeddings in case the tokenizer added special tokens.
    model.resize_token_embeddings(len(tokenizer))

    return model, tokenizer


def train_model(
    model,
    tokenizer,
    train_dataset,
    eval_dataset,
    config: dict,
    output_dir: str,
    resume_from_checkpoint: Optional[str] = None,
):
    """Train the Sheikh-2.5-Coder model."""
    # Set up training arguments.
    training_config = config['training']
    args = setup_training_args(
        output_dir=output_dir,
        learning_rate=training_config['learning_rate']
    )

    # Data collator that pads inputs and labels to a common length per batch.
    data_collator = DataCollatorForSeq2Seq(
        tokenizer=tokenizer,
        model=model,
        padding=True,
        return_tensors="pt",
    )

    # Initialize the trainer.
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

    # Start training, optionally resuming from a saved checkpoint.
    print("Starting training...")
    trainer.train(resume_from_checkpoint=resume_from_checkpoint)

    # Save the final model and tokenizer.
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    print(f"Training completed! Model saved to {output_dir}")
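
# For reference, a minimal sketch of what prepare_training_data could look like
# against a real corpus. The dataset name ("bigcode/the-stack-smol"), its
# "content" text field, and the max_length default are illustrative assumptions,
# not part of this project; substitute your actual data source and fields.
def prepare_training_data_from_hub(tokenizer, data_config: dict) -> Dataset:
    """Hypothetical helper: load a Hub dataset and tokenize it for causal LM training."""
    raw = load_dataset("bigcode/the-stack-smol", split="train")  # assumed corpus

    def tokenize(batch):
        encoded = tokenizer(
            batch["content"],
            truncation=True,
            max_length=data_config.get("max_length", 2048),
        )
        # For causal LM training, labels mirror the inputs; HF-style models
        # shift them internally when computing the loss.
        encoded["labels"] = [ids.copy() for ids in encoded["input_ids"]]
        return encoded

    # Tokenize in batches and drop the raw text columns.
    return raw.map(tokenize, batched=True, remove_columns=raw.column_names)
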
def main():
    parser = argparse.ArgumentParser(description="Train Sheikh-2.5-Coder model")
    parser.add_argument(
        "--config",
        type=str,
        default="training_config.yaml",
        help="Path to training configuration file"
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="./sheikh-2.5-coder-output",
        help="Directory to save the trained model"
    )
    parser.add_argument(
        "--resume-from-checkpoint",
        type=str,
        default=None,
        help="Path to checkpoint to resume from"
    )

    args = parser.parse_args()

    # Load configuration.
    config = load_config(args.config)

    # Set up model and tokenizer.
    model, tokenizer = setup_model_and_tokenizer(config)

    # Prepare training data.
    train_dataset = prepare_training_data(config['data'])
    eval_dataset = prepare_training_data(config['data'])  # Placeholder

    # Create the output directory.
    os.makedirs(args.output_dir, exist_ok=True)

    # Train the model.
    train_model(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        config=config,
        output_dir=args.output_dir,
        resume_from_checkpoint=args.resume_from_checkpoint,
    )


if __name__ == "__main__":
    main()
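
# For reference, a training_config.yaml shape that satisfies the keys this
# script reads (config['model'][...], config['training']['learning_rate'],
# config['data']). The values below are illustrative assumptions, not tuned
# hyperparameters:
#
#   model:
#     hidden_size: 2048
#     num_attention_heads: 16
#     num_key_value_heads: 2
#     num_hidden_layers: 36
#     intermediate_size: 11008
#     context_length: 32768
#   training:
#     learning_rate: 2.0e-5
#   data:
#     max_length: 2048
#
# Example invocations (assuming this file is saved as train.py):
#   python train.py --config training_config.yaml --output-dir ./sheikh-2.5-coder-output
#   python train.py --resume-from-checkpoint ./sheikh-2.5-coder-output/checkpoint-500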