from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
import os
import sys
print("π₯ Python AI training script started!", file=sys.stderr)
DATASET_PATH = "python_ai_dataset.jsonl"
MODEL_ID = "bigcode/starcoderbase-7b"
OUTPUT_DIR = "train_output"
# === Step 1: Check dataset ===
if not os.path.exists(DATASET_PATH):
    print(f"❌ Dataset file not found: {DATASET_PATH}", file=sys.stderr)
    sys.exit(1)
# === Step 2: Load dataset (first 10 samples for fast test) ===
try:
    dataset = load_dataset("json", data_files=DATASET_PATH, split="train[:10]")  # Load only 10 samples for testing
except Exception as e:
    print(f"❌ Failed to load dataset: {e}", file=sys.stderr)
    sys.exit(1)
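# The tokenize step below reads "prompt" and "completion" fields from each JSONL
# record. A record would look roughly like this (illustrative example only, not
# taken from the real dataset):
# {"prompt": "Write a function that adds two numbers", "completion": "def add(a, b):\n    return a + b"}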
# === Step 3: Load tokenizer and model ===
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, trust_remote_code=True)
except Exception as e:
    print(f"❌ Failed to load model/tokenizer: {e}", file=sys.stderr)
    sys.exit(1)
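# NOTE (assumption, not in the original script): a 7B model loaded in full fp32
# needs roughly 28 GB of memory. If that is too much, half-precision loading is a
# common alternative (requires torch; device_map needs accelerate installed), e.g.:
# import torch
# model = AutoModelForCausalLM.from_pretrained(
#     MODEL_ID, torch_dtype=torch.float16, trust_remote_code=True, device_map="auto"
# )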
# === Step 4: Preprocess data ===
def tokenize(example):
    return tokenizer(example["prompt"] + "\n" + example["completion"], truncation=True, max_length=512)

try:
    tokenized_dataset = dataset.map(tokenize, remove_columns=["prompt", "completion"])
except Exception as e:
    print(f"❌ Tokenization error: {e}", file=sys.stderr)
    sys.exit(1)
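# Defensive fallback (assumption, added for robustness): some code-model tokenizers
# ship without a pad token, and DataCollatorForLanguageModeling needs one to pad
# batches. Reuse the EOS token as the pad token if none is set; this is a no-op
# when the tokenizer already defines one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token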
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
# === Step 5: Training config ===
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    per_device_train_batch_size=1,
    num_train_epochs=1,
    logging_dir="./logs",
    logging_steps=1,
    save_strategy="epoch",
    save_total_limit=1,
    fp16=False,
    report_to="none",
)
# === Step 6: Train the model ===
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)
print("π Starting training on 10 samples...", file=sys.stderr)
trainer.train()
# === Step 7: Save model ===
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("β
Training finished and model saved!", file=sys.stderr) |
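# Optional sanity check (hypothetical, not part of the original script): reload the
# saved artifacts from OUTPUT_DIR and generate a short completion to confirm they
# are usable. Left commented out so the training run's behavior is unchanged.
# reloaded_tok = AutoTokenizer.from_pretrained(OUTPUT_DIR)
# reloaded_model = AutoModelForCausalLM.from_pretrained(OUTPUT_DIR)
# ids = reloaded_tok("def hello_world():", return_tensors="pt").input_ids
# print(reloaded_tok.decode(reloaded_model.generate(ids, max_new_tokens=20)[0]))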