Josh Weaver committed
Commit f899866 · 1 Parent(s): baf799d
Files changed (3)
  1. README.md +7 -0
  2. requirements.txt +0 -0
  3. train.py +76 -0
README.md ADDED
@@ -0,0 +1,7 @@
+ # StarCoder Fine-tuning
+
+ This repository contains the training code for fine-tuning StarCoder on a custom code dataset.
+
+ ## Training
+
+ This code is designed to run on Hugging Face's training infrastructure.
requirements.txt ADDED
File without changes
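The file is committed empty. A plausible minimal set, inferred from the imports in train.py plus accelerate, which device_map="auto" requires (this list is an inference, not part of the commit), would be:

    transformers
    datasets
    torch
    accelerate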
train.py ADDED
@@ -0,0 +1,76 @@
+ from transformers import (
+     AutoTokenizer,
+     AutoModelForCausalLM,
+     TrainingArguments,
+     Trainer,
+     DataCollatorForLanguageModeling,
+ )
+ from datasets import load_dataset
+ import torch
+ import os
+
+ def tokenize_function(examples):
+     # Tokenize a batch of examples; datasets stores map() output as lists,
+     # so return_tensors="pt" is unnecessary here and is omitted.
+     return tokenizer(
+         examples["text"],
+         truncation=True,
+         max_length=512,
+         padding="max_length",
+     )
+
+ # Initialize model and tokenizer
+ model_name = "bigcode/starcoder2-15b"
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ if tokenizer.pad_token is None:
+     tokenizer.pad_token = tokenizer.eos_token  # padding="max_length" above needs a pad token
+ model = AutoModelForCausalLM.from_pretrained(
+     model_name,
+     torch_dtype=torch.bfloat16,  # Use bfloat16 for better memory efficiency
+     device_map="auto"  # Automatically handle model parallelism
+ )
+
+ # Load and preprocess dataset
+ dataset = load_dataset("officialweaver/code")
+ tokenized_dataset = dataset.map(
+     tokenize_function,
+     batched=True,
+     remove_columns=dataset["train"].column_names
+ )
+
+ # Training arguments
+ training_args = TrainingArguments(
+     output_dir="./starcoder-finetuned",
+     num_train_epochs=3,
+     per_device_train_batch_size=4,
+     per_device_eval_batch_size=4,
+     warmup_steps=500,
+     weight_decay=0.01,
+     logging_dir="./logs",
+     logging_steps=100,
+     evaluation_strategy="steps",
+     eval_steps=500,
+     save_strategy="steps",
+     save_steps=500,
+     learning_rate=5e-5,
+     bf16=True,  # Mixed precision matching the bfloat16 weights above; fp16 would conflict with them
+     gradient_accumulation_steps=4,  # Effective batch of 16 per device (4 x 4)
+     load_best_model_at_end=True,
+     metric_for_best_model="eval_loss",
+     greater_is_better=False,
+ )
+
+ # Initialize trainer
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=tokenized_dataset["train"],
+     eval_dataset=tokenized_dataset["validation"],
+     data_collator=DataCollatorForLanguageModeling(
+         tokenizer=tokenizer,
+         mlm=False  # We're doing causal language modeling, not masked
+     )
+ )
+
+ # Train the model
+ trainer.train()
+
+ # Save the model
+ trainer.save_model("./starcoder-finetuned-final")
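A minimal sketch of reloading the saved checkpoint for generation, assuming the training run completed. The prompt is illustrative, and the tokenizer is reloaded from the base model because save_model writes only the model files:

    from transformers import AutoTokenizer, AutoModelForCausalLM
    import torch

    # Load the fine-tuned weights saved by train.py
    model = AutoModelForCausalLM.from_pretrained(
        "./starcoder-finetuned-final",
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    # save_model() does not write the tokenizer, so reuse the base one
    tokenizer = AutoTokenizer.from_pretrained("bigcode/starcoder2-15b")

    prompt = "def fibonacci(n):"  # illustrative prompt, not from the commit
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=64)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))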