from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
import os
# === CONFIG ===
DATASET_PATH = "python_ai_dataset.jsonl" # Your .jsonl file
MODEL_ID = "bigcode/starcoderbase-7b"
OUTPUT_DIR = "train_output"
# === Load Dataset ===
dataset = load_dataset("json", data_files=DATASET_PATH, split="train")
# === Load Tokenizer and Model ===
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # StarCoder-style tokenizers ship without a pad token; the collator needs one for padding
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, trust_remote_code=True)
# === Preprocessing ===
def tokenize(example):
    # Join prompt and completion into a single causal-LM training string.
    return tokenizer(example["prompt"] + "\n" + example["completion"],
                     truncation=True, max_length=512)
tokenized_dataset = dataset.map(tokenize, remove_columns=["prompt", "completion"])
# === Data Collator ===
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)  # mlm=False: labels are a copy of input_ids (causal LM), padding masked out
# === Training Arguments ===
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,  # effective batch size of 4 per device
    num_train_epochs=2,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=2,
    fp16=True,
    bf16=False,
    report_to="none",  # Disable wandb/tensorboard reporting integrations
)
# === Trainer ===
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)
# === Start Training ===
trainer.train()
# === Save Final Model ===
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
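# === Optional: quick generation check ===
# Minimal sketch of reloading the saved checkpoint and sampling from it;
# the prompt string is a placeholder and max_new_tokens=64 is an arbitrary choice.
ft_tokenizer = AutoTokenizer.from_pretrained(OUTPUT_DIR, trust_remote_code=True)
ft_model = AutoModelForCausalLM.from_pretrained(OUTPUT_DIR, trust_remote_code=True)
inputs = ft_tokenizer("Write a Python function that reverses a string.\n", return_tensors="pt")
generated = ft_model.generate(**inputs, max_new_tokens=64)
print(ft_tokenizer.decode(generated[0], skip_special_tokens=True))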