from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
model_id = "bigcode/starcoderbase-7b"
dataset_repo = "Percy3822/python_ai_coder" # Your HF dataset repo
# Load dataset
dataset = load_dataset(dataset_repo, split="train")
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)
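# Note: full fine-tuning of a 7B model is memory-intensive. The fp16 weights
# alone are ~14 GB, and gradients plus Adam optimizer states add several
# times that, so this usually needs a large multi-GPU setup rather than basic
# Space hardware. (Parameter-efficient methods such as LoRA are the common
# workaround, but this script does full fine-tuning as written.)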
# StarCoder's tokenizer has no pad token by default; the collator needs one
# to batch variable-length examples, so reuse the EOS token for padding.
tokenizer.pad_token = tokenizer.eos_token

# Tokenize: join prompt and completion into one causal-LM sequence and append
# EOS so the model learns where a completion ends.
def tokenize(example):
    return tokenizer(
        example["prompt"] + "\n" + example["completion"] + tokenizer.eos_token,
        truncation=True,
        max_length=512,
    )

tokenized = dataset.map(tokenize, remove_columns=["prompt", "completion"])

# mlm=False gives standard causal language modeling (labels are the inputs).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
# Training config: effective batch size is 2 x 4 = 8 sequences per device.
args = TrainingArguments(
    output_dir="./python-ai-model",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,
    evaluation_strategy="no",  # renamed to `eval_strategy` in recent transformers releases
    fp16=True,
    push_to_hub=True,
    hub_model_id="Percy3822/python_ai_coder",
    hub_token="<your_HF_token_here>",  # Optional if you run in a linked HF Space
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized,
    tokenizer=tokenizer,
    data_collator=data_collator,
)
trainer.train()
trainer.push_to_hub()
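
# Optional sanity check after training: generate from the fine-tuned model.
# This is an illustrative sketch, not part of the original script; the prompt
# is a made-up example and the generation settings are plain defaults.
sample = tokenizer(
    "Write a Python function that reverses a string.\n", return_tensors="pt"
).to(model.device)
generated = model.generate(**sample, max_new_tokens=128)
print(tokenizer.decode(generated[0], skip_special_tokens=True))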