from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset
import torch


def tokenize_function(examples):
    # Tokenize to a fixed length. The Trainer's data collator converts the
    # lists to tensors later, so return_tensors is not needed (and can cause
    # shape issues inside datasets.map).
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=512,
        padding="max_length",
    )


# Initialize model and tokenizer
model_name = "bigcode/starcoder2-15b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    # Causal LM tokenizers often ship without a pad token; reuse EOS for padding
    tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,  # Use bfloat16 for better memory efficiency
    device_map="auto",           # Automatically handle model parallelism
)

# Load and preprocess dataset
dataset = load_dataset("officialweaver/code")
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

# Training arguments
training_args = TrainingArguments(
    output_dir="./starcoder-finetuned",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    learning_rate=5e-5,
    bf16=True,  # Mixed precision must match the bfloat16 weights loaded above; fp16=True would conflict
    gradient_accumulation_steps=4,  # Effective batch size per device = 4 * 4 = 16
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],  # assumes the dataset provides a validation split
    data_collator=DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,  # We're doing causal language modeling, not masked
    ),
)

# Train the model
trainer.train()

# Save the model
trainer.save_model("./starcoder-finetuned-final")
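
# ----------------------------------------------------------------------
# Optional sanity check after training: a minimal sketch that reloads the
# saved checkpoint and generates a short completion. The prompt and
# generation settings below are illustrative assumptions, not part of the
# training recipe above.
finetuned = AutoModelForCausalLM.from_pretrained(
    "./starcoder-finetuned-final",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
prompt = "def fibonacci(n):"  # hypothetical test prompt
inputs = tokenizer(prompt, return_tensors="pt").to(finetuned.device)
outputs = finetuned.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))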