from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
import os
import sys

print("πŸ”₯ Python AI training script started!", file=sys.stderr)

DATASET_PATH = "python_ai_dataset.jsonl"
MODEL_ID = "bigcode/starcoderbase-7b"
OUTPUT_DIR = "train_output"

# === Step 1: Check dataset ===
if not os.path.exists(DATASET_PATH):
    print(f"❌ Dataset file not found: {DATASET_PATH}", file=sys.stderr)
    sys.exit(1)

# === Step 2: Load dataset (first 10 samples for fast test) ===
try:
    dataset = load_dataset("json", data_files=DATASET_PATH, split="train[:10]")  # Load only 10 samples for testing
except Exception as e:
    print(f"❌ Failed to load dataset: {e}", file=sys.stderr)
    sys.exit(1)
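
# NOTE: each JSONL record is expected to carry "prompt" and "completion"
# string fields; Step 4 below reads exactly those keys. An illustrative record:
#   {"prompt": "Write a function that adds two numbers.", "completion": "def add(a, b):\n    return a + b"}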

# === Step 3: Load tokenizer and model ===
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    # GPT-style tokenizers (including StarCoder's) ship without a pad token;
    # the data collator needs one to batch sequences, so reuse EOS.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, trust_remote_code=True)
except Exception as e:
    print(f"❌ Failed to load model/tokenizer: {e}", file=sys.stderr)
    sys.exit(1)
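
# NOTE: starcoderbase-7b in full fp32 needs roughly 28 GB of RAM/VRAM for the
# weights alone (7B params x 4 bytes). If that is too tight, a half-precision
# load is a common alternative (sketch; assumes a CUDA GPU and the
# accelerate package installed):
#   import torch
#   model = AutoModelForCausalLM.from_pretrained(
#       MODEL_ID, trust_remote_code=True,
#       torch_dtype=torch.float16, device_map="auto",
#   )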

# === Step 4: Preprocess data ===
def tokenize(example):
    # Concatenate prompt and completion into one causal-LM training sequence,
    # and append EOS so the model learns where completions end.
    text = example["prompt"] + "\n" + example["completion"] + tokenizer.eos_token
    return tokenizer(text, truncation=True, max_length=512)

try:
    tokenized_dataset = dataset.map(tokenize, remove_columns=["prompt", "completion"])
except Exception as e:
    print(f"❌ Tokenization error: {e}", file=sys.stderr)
    sys.exit(1)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
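# mlm=False gives plain causal-LM batches: the collator copies input_ids into
# labels (with padding masked out as -100) instead of masking random tokens.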

# === Step 5: Training config ===
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    per_device_train_batch_size=1,
    num_train_epochs=1,
    logging_dir="./logs",
    logging_steps=1,
    save_strategy="epoch",
    save_total_limit=1,
    fp16=False,
    report_to="none"
)
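
# These are smoke-test settings: batch size 1, a single epoch, a log line
# every step, and only the final (epoch-end) checkpoint kept on disk.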

# === Step 6: Train the model ===
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

print("πŸš€ Starting training on 10 samples...", file=sys.stderr)
trainer.train()

# === Step 7: Save model ===
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("βœ… Training finished and model saved!", file=sys.stderr)