Spaces:
Sleeping
Sleeping
File size: 1,692 Bytes
587575a 04a8e34 01be04f 587575a 04a8e34 01be04f 587575a 01be04f 04a8e34 01be04f 04a8e34 01be04f 04a8e34 01be04f 04a8e34 587575a 04a8e34 01be04f 04a8e34 587575a 01be04f 04a8e34 01be04f 04a8e34 01be04f 587575a 04a8e34 587575a 04a8e34 01be04f 04a8e34 01be04f 04a8e34 01be04f 587575a 01be04f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 |
---
# train.py — Trains StarCoder 7B on your dataset
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
import os
import sys
print("🔥 Training script started...", file=sys.stderr)

# === CONFIG ===
DATASET_PATH = "python_ai_dataset.jsonl"  # JSONL with "prompt"/"completion" fields; must exist in Space root
MODEL_ID = "bigcode/starcoderbase-7b"
OUTPUT_DIR = "train_output"

# === Load Dataset ===
dataset = load_dataset("json", data_files=DATASET_PATH, split="train")

# === Load Tokenizer and Model ===
# trust_remote_code: StarCoder repos ship custom code that must be executed to load.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
# StarCoder's GPT-2-style tokenizer ships without a pad token; the language-modeling
# collator needs one to batch variable-length sequences, so reuse EOS for padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, trust_remote_code=True)
# === Preprocessing ===
def tokenize(example):
    """Tokenize one prompt/completion pair into a single causal-LM sequence.

    The prompt and completion are joined with a newline and terminated with
    the EOS token so the model learns where a completion ends; without EOS,
    consecutive examples run together during training.
    Truncates to 512 tokens.
    """
    text = example["prompt"] + "\n" + example["completion"] + tokenizer.eos_token
    return tokenizer(text, truncation=True, max_length=512)
# Tokenize every row and drop the raw text columns, leaving only the
# model inputs (input_ids / attention_mask).
_raw_columns = ["prompt", "completion"]
tokenized_dataset = dataset.map(tokenize, remove_columns=_raw_columns)

# Collator for causal LM: mlm=False means labels are the inputs themselves
# (next-token prediction), not BERT-style masked tokens.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
# === Training Args ===
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    # Effective batch size = 1 x 4 via gradient accumulation.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=2,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=2,  # keep only the two most recent checkpoints
    # NOTE(review): fp16=True requires a CUDA GPU — it raises on CPU-only
    # hardware; confirm the Space has GPU hardware attached.
    fp16=True,
    bf16=False,
    report_to="none",  # disable wandb/tensorboard/etc. reporting
)
# === Train ===
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)
trainer.train()

# === Save ===
# Persist the final weights and tokenizer files together so the output
# directory is directly loadable with from_pretrained().
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)