# ✅ train.py: trains StarCoder 7B on your dataset

import sys

from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

print("πŸ”₯ Training script started...", file=sys.stderr)

# === CONFIG ===
DATASET_PATH = "python_ai_dataset.jsonl"  # Must exist in Space root
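# Each line of the JSONL file is expected to be a {"prompt": ..., "completion": ...}
# record, matching the fields read in tokenize() below. Illustrative example:
#   {"prompt": "Reverse a string.", "completion": "def rev(s):\n    return s[::-1]"}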
MODEL_ID = "bigcode/starcoderbase-7b"
OUTPUT_DIR = "train_output"

# === Load Dataset ===
dataset = load_dataset("json", data_files=DATASET_PATH, split="train")

# === Load Tokenizer and Model ===
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
# The data collator needs a pad token to batch variable-length sequences, and
# StarCoder checkpoints typically don't define one, so fall back to EOS if missing.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, trust_remote_code=True)

# === Preprocessing ===
def tokenize(example):
    # Concatenate prompt and completion into one sequence and append EOS so the
    # model learns where a completion ends.
    text = example["prompt"] + "\n" + example["completion"] + tokenizer.eos_token
    return tokenizer(text, truncation=True, max_length=512)

tokenized_dataset = dataset.map(tokenize, remove_columns=["prompt", "completion"])

# mlm=False selects the causal-LM objective (plain next-token prediction) rather
# than masked LM; the collator also pads each batch dynamically.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# === Training Args ===
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=2,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=2,
    fp16=True,  # mixed precision needs a CUDA GPU; prefer bf16=True on Ampere+
    bf16=False,
    report_to="none",
)
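# With per_device_train_batch_size=1 and gradient_accumulation_steps=4, the
# effective batch size is 4 sequences per device per optimizer step.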

# === Train ===
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)
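# Note: recent transformers releases prefer Trainer(processing_class=tokenizer);
# the tokenizer= keyword still works but may emit a deprecation warning.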

trainer.train()

# === Save ===
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
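
# === Optional sanity check ===
# A minimal, commented-out sketch of reloading the saved checkpoint for
# generation; run it in a separate session after training finishes.
#
#   from transformers import AutoModelForCausalLM, AutoTokenizer
#   tok = AutoTokenizer.from_pretrained("train_output")
#   mdl = AutoModelForCausalLM.from_pretrained("train_output", trust_remote_code=True)
#   ids = tok("def fibonacci(n):", return_tensors="pt").input_ids
#   print(tok.decode(mdl.generate(ids, max_new_tokens=64)[0]))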