from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)

# === CONFIG ===
DATASET_PATH = "python_ai_dataset.jsonl"  # Your .jsonl file
MODEL_ID = "bigcode/starcoderbase-7b"
OUTPUT_DIR = "train_output"

# === Load Dataset ===
dataset = load_dataset("json", data_files=DATASET_PATH, split="train")
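# The loader assumes each .jsonl line is an object with "prompt" and
# "completion" string fields (these are the columns used below).
# A hypothetical sample record:
# {"prompt": "Reverse a string in Python.", "completion": "def reverse(s):\n    return s[::-1]"}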

# === Load Tokenizer and Model ===
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
# StarCoder's GPT-2-style tokenizer ships without a pad token, but the data
# collator below needs one to batch variable-length sequences; reuse EOS.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, trust_remote_code=True)
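# NOTE (assumption about your hardware): fully fine-tuning a 7B model needs
# substantial GPU memory (~28 GB for fp32 weights alone). A common tweak is
# to load the weights in half precision instead, e.g.:
#   model = AutoModelForCausalLM.from_pretrained(
#       MODEL_ID, torch_dtype=torch.float16, trust_remote_code=True)
# (requires `import torch`).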

# === Preprocessing ===
def tokenize(example):
    # Join prompt and completion into one training sequence and append EOS
    # so the model learns where a completion ends.
    text = example["prompt"] + "\n" + example["completion"] + tokenizer.eos_token
    return tokenizer(text, truncation=True, max_length=512)

tokenized_dataset = dataset.map(tokenize, remove_columns=["prompt", "completion"])
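# Optional sanity check: decode the first tokenized example back to text to
# confirm the prompt/completion join looks right.
# print(tokenizer.decode(tokenized_dataset[0]["input_ids"]))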

# === Data Collator ===
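# mlm=False selects plain causal-LM collation: the collator pads each batch
# and copies input_ids to labels, masking padded positions to -100.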
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# === Training Arguments ===
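# Effective batch size = per_device_train_batch_size * gradient_accumulation_steps = 4.
# fp16=True assumes a CUDA GPU; on Ampere or newer cards bf16=True is often
# the more stable choice.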
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=2,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=2,
    fp16=True,
    bf16=False,
    report_to="none",  # disable logging integrations (W&B, TensorBoard, etc.)
)

# === Trainer ===
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# === Start Training ===
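# Checkpoints are written to OUTPUT_DIR/checkpoint-* once per epoch; with
# save_total_limit=2 only the two most recent are kept. An interrupted run
# can be resumed with trainer.train(resume_from_checkpoint=True).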
trainer.train()

# === Save Final Model ===
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
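
# === Quick inference check (hypothetical usage, not part of the training run) ===
# The saved model can be reloaded for generation, e.g.:
#
#   from transformers import pipeline
#   pipe = pipeline("text-generation", model=OUTPUT_DIR, tokenizer=OUTPUT_DIR)
#   print(pipe("Reverse a string in Python.\n", max_new_tokens=64)[0]["generated_text"])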