Percy3822 committed
Commit cc4f041 Β· verified Β· 1 Parent(s): cf20708

Update train.py

Files changed (1):
  1. train.py +37 -24
train.py CHANGED
@@ -1,50 +1,61 @@
----
-# βœ… train.py β€” Trains StarCoder 7B on your dataset
-
 from datasets import load_dataset
 from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
 import os
 import sys
 
-print("πŸ”₯ Training script started...", file=sys.stderr)
+print("πŸ”₯ Python AI training script started!", file=sys.stderr)
 
-# === CONFIG ===
-DATASET_PATH = "python_ai_dataset.jsonl"  # Must exist in Space root
+DATASET_PATH = "python_ai_dataset.jsonl"
 MODEL_ID = "bigcode/starcoderbase-7b"
 OUTPUT_DIR = "train_output"
 
-# === Load Dataset ===
-dataset = load_dataset("json", data_files=DATASET_PATH, split="train")
+# === Step 1: Check dataset ===
+if not os.path.exists(DATASET_PATH):
+    print(f"❌ Dataset file not found: {DATASET_PATH}", file=sys.stderr)
+    sys.exit(1)
+
+# === Step 2: Load dataset (first 10 samples for fast test) ===
+try:
+    dataset = load_dataset("json", data_files=DATASET_PATH, split="train[:10]")  # Load only 10 samples for testing
+except Exception as e:
+    print(f"❌ Failed to load dataset: {e}", file=sys.stderr)
+    sys.exit(1)
 
-# === Load Tokenizer and Model ===
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
-model = AutoModelForCausalLM.from_pretrained(MODEL_ID, trust_remote_code=True)
+# === Step 3: Load tokenizer and model ===
+try:
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, trust_remote_code=True)
+except Exception as e:
+    print(f"❌ Failed to load model/tokenizer: {e}", file=sys.stderr)
+    sys.exit(1)
 
-# === Preprocessing ===
+# === Step 4: Preprocess data ===
 def tokenize(example):
     return tokenizer(example["prompt"] + "\n" + example["completion"], truncation=True, max_length=512)
 
-tokenized_dataset = dataset.map(tokenize, remove_columns=["prompt", "completion"])
+try:
+    tokenized_dataset = dataset.map(tokenize, remove_columns=["prompt", "completion"])
+except Exception as e:
+    print(f"❌ Tokenization error: {e}", file=sys.stderr)
+    sys.exit(1)
 
 data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
 
-# === Training Args ===
+# === Step 5: Training config ===
 training_args = TrainingArguments(
     output_dir=OUTPUT_DIR,
     overwrite_output_dir=True,
     per_device_train_batch_size=1,
-    gradient_accumulation_steps=4,
-    num_train_epochs=2,
+    num_train_epochs=1,
     logging_dir="./logs",
-    logging_steps=10,
+    logging_steps=1,
     save_strategy="epoch",
-    save_total_limit=2,
-    fp16=True,
-    bf16=False,
-    report_to="none",
+    save_total_limit=1,
+    fp16=False,
+    report_to="none"
 )
 
-# === Train ===
+# === Step 6: Train the model ===
 trainer = Trainer(
     model=model,
     args=training_args,
@@ -53,8 +64,10 @@ trainer = Trainer(
     data_collator=data_collator
 )
 
+print("πŸš€ Starting training on 10 samples...", file=sys.stderr)
 trainer.train()
 
-# === Save ===
+# === Step 7: Save model ===
 trainer.save_model(OUTPUT_DIR)
-tokenizer.save_pretrained(OUTPUT_DIR)
+tokenizer.save_pretrained(OUTPUT_DIR)
+print("βœ… Training finished and model saved!", file=sys.stderr)