from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling

model_id = "bigcode/starcoderbase-7b"
dataset_repo = "Percy3822/python_ai_coder"  # Your HF dataset repo

# Load dataset
dataset = load_dataset(dataset_repo, split="train")
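# Each example is expected to provide "prompt" and "completion" text fields
# (that is what the tokenize function below consumes).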

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
# StarCoder's tokenizer ships without a pad token, and the data collator below
# needs one for padding; reuse EOS if none is set.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)

# Tokenize: join each prompt/completion pair into one training sequence.
def tokenize(example):
    # Append EOS so the model learns where a completion ends.
    text = example["prompt"] + "\n" + example["completion"] + tokenizer.eos_token
    return tokenizer(text, truncation=True, max_length=512)

tokenized = dataset.map(tokenize, remove_columns=["prompt", "completion"])
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
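# With mlm=False the collator builds causal-LM batches: labels are a copy of
# input_ids, with padded positions set to -100 so the loss ignores them.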

# Training config
args = TrainingArguments(
    output_dir="./python-ai-model",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # effective batch size of 2 * 4 = 8 per device
    num_train_epochs=3,
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,             # keep only the two most recent checkpoints
    evaluation_strategy="no",       # renamed to `eval_strategy` in newer transformers releases
    fp16=True,                      # mixed precision; requires a CUDA GPU
    push_to_hub=True,
    hub_model_id="Percy3822/python_ai_coder",
    hub_token="<your_HF_token_here>",  # optional if the environment is already authenticated (e.g. a linked HF Space)
)

trainer = Trainer(
    model=model,
    train_dataset=tokenized,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
)
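# Trainer handles the training loop and checkpointing, and with push_to_hub=True
# it also uploads checkpoints to the Hub as they are saved.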

trainer.train()
trainer.push_to_hub()
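
# Optional smoke test once the push finishes (a minimal sketch, not part of the
# original script: the prompt and generation settings below are illustrative
# assumptions).
from transformers import pipeline

generator = pipeline("text-generation", model="Percy3822/python_ai_coder")
out = generator("Write a Python function that reverses a string.\n", max_new_tokens=128)
print(out[0]["generated_text"])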