Percy3822 committed on
Commit
587575a
·
verified ·
1 Parent(s): ab5224c

Update train.py

Browse files
Files changed (1) hide show
  1. train.py +12 -9
train.py CHANGED
@@ -1,9 +1,15 @@
 
 
 
1
  from datasets import load_dataset
2
  from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
3
  import os
 
 
 
4
 
5
  # === CONFIG ===
6
- DATASET_PATH = "python_ai_dataset.jsonl" # Your .jsonl file
7
  MODEL_ID = "bigcode/starcoderbase-7b"
8
  OUTPUT_DIR = "train_output"
9
 
@@ -16,15 +22,13 @@ model = AutoModelForCausalLM.from_pretrained(MODEL_ID, trust_remote_code=True)
16
 
17
  # === Preprocessing ===
18
  def tokenize(example):
19
- return tokenizer(example["prompt"] + "\n" + example["completion"],
20
- truncation=True, max_length=512)
21
 
22
  tokenized_dataset = dataset.map(tokenize, remove_columns=["prompt", "completion"])
23
 
24
- # === Data Collator ===
25
  data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
26
 
27
- # === Training Arguments ===
28
  training_args = TrainingArguments(
29
  output_dir=OUTPUT_DIR,
30
  overwrite_output_dir=True,
@@ -37,10 +41,10 @@ training_args = TrainingArguments(
37
  save_total_limit=2,
38
  fp16=True,
39
  bf16=False,
40
- report_to="none", # Prevent HF integration logs
41
  )
42
 
43
- # === Trainer ===
44
  trainer = Trainer(
45
  model=model,
46
  args=training_args,
@@ -49,9 +53,8 @@ trainer = Trainer(
49
  data_collator=data_collator
50
  )
51
 
52
- # === Start Training ===
53
  trainer.train()
54
 
55
- # === Save Final Model ===
56
  trainer.save_model(OUTPUT_DIR)
57
  tokenizer.save_pretrained(OUTPUT_DIR)
 
1
+ ---
2
+ # ✅ train.py — Trains StarCoder 7B on your dataset
3
+
4
  from datasets import load_dataset
5
  from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
6
  import os
7
+ import sys
8
+
9
+ print("🔥 Training script started...", file=sys.stderr)
10
 
11
  # === CONFIG ===
12
+ DATASET_PATH = "python_ai_dataset.jsonl" # Must exist in Space root
13
  MODEL_ID = "bigcode/starcoderbase-7b"
14
  OUTPUT_DIR = "train_output"
15
 
 
22
 
23
  # === Preprocessing ===
24
  def tokenize(example):
25
+ return tokenizer(example["prompt"] + "\n" + example["completion"], truncation=True, max_length=512)
 
26
 
27
  tokenized_dataset = dataset.map(tokenize, remove_columns=["prompt", "completion"])
28
 
 
29
  data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
30
 
31
+ # === Training Args ===
32
  training_args = TrainingArguments(
33
  output_dir=OUTPUT_DIR,
34
  overwrite_output_dir=True,
 
41
  save_total_limit=2,
42
  fp16=True,
43
  bf16=False,
44
+ report_to="none",
45
  )
46
 
47
+ # === Train ===
48
  trainer = Trainer(
49
  model=model,
50
  args=training_args,
 
53
  data_collator=data_collator
54
  )
55
 
 
56
  trainer.train()
57
 
58
+ # === Save ===
59
  trainer.save_model(OUTPUT_DIR)
60
  tokenizer.save_pretrained(OUTPUT_DIR)