Percy3822 committed
Commit 01be04f · verified · 1 Parent(s): a0e05b2

Update train.py

Files changed (1):
  1. train.py (+36 -26)
train.py CHANGED
@@ -1,47 +1,57 @@
 from datasets import load_dataset
 from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling

-model_id = "bigcode/starcoderbase-7b"
-dataset_repo = "Percy3822/python_ai_coder"  # Your HF dataset repo

-# Load dataset
-dataset = load_dataset(dataset_repo, split="train")

-# Load tokenizer and model
-tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
-model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)

-# Tokenize
 def tokenize(example):
-    result = tokenizer(example["prompt"] + "\n" + example["completion"], truncation=True, max_length=512)
-    return result

-tokenized = dataset.map(tokenize, remove_columns=["prompt", "completion"])
 data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

-# Training config
-args = TrainingArguments(
-    output_dir="./python-ai-model",
-    per_device_train_batch_size=2,
     gradient_accumulation_steps=4,
-    num_train_epochs=3,
-    logging_steps=50,
-    save_steps=500,
     save_total_limit=2,
-    evaluation_strategy="no",
     fp16=True,
-    push_to_hub=True,
-    hub_model_id="Percy3822/python_ai_coder",
-    hub_token="<your_HF_token_here>"  # Optional if you run in a linked HF Space
 )

 trainer = Trainer(
     model=model,
-    train_dataset=tokenized,
     tokenizer=tokenizer,
-    args=args,
-    data_collator=data_collator,
 )

 trainer.train()
-trainer.push_to_hub()
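The most consequential deletion above is the hub_token="<your_HF_token_here>" line: the old script pushed to the Hub with a token pasted into source, which exposes the credential to anyone who can read the repo. If Hub upload is still wanted, a safer pattern is to pull the token from the environment instead; a minimal sketch, assuming the token is exported as HF_TOKEN (a variable name chosen here for illustration, not something this commit defines):

import os
from transformers import TrainingArguments

# Sketch only: read the Hub token from an environment variable
# rather than committing it to the repository.
hub_token = os.environ.get("HF_TOKEN")  # None if the variable is unset

args = TrainingArguments(
    output_dir="./python-ai-model",
    push_to_hub=hub_token is not None,  # only push when a token is available
    hub_model_id="Percy3822/python_ai_coder",
    hub_token=hub_token,
)

The updated script sidesteps the issue entirely: it trains from a local .jsonl file and saves the result to disk instead of pushing to the Hub.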
 
 
 
 
 from datasets import load_dataset
 from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
+import os

+# === CONFIG ===
+DATASET_PATH = "python_ai_dataset.jsonl"  # Your .jsonl file
+MODEL_ID = "bigcode/starcoderbase-7b"
+OUTPUT_DIR = "train_output"

+# === Load Dataset ===
+dataset = load_dataset("json", data_files=DATASET_PATH, split="train")

+# === Load Tokenizer and Model ===
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, trust_remote_code=True)

+# === Preprocessing ===
 def tokenize(example):
+    return tokenizer(example["prompt"] + "\n" + example["completion"],
+                     truncation=True, max_length=512)

+tokenized_dataset = dataset.map(tokenize, remove_columns=["prompt", "completion"])
+
+# === Data Collator ===
 data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

+# === Training Arguments ===
+training_args = TrainingArguments(
+    output_dir=OUTPUT_DIR,
+    overwrite_output_dir=True,
+    per_device_train_batch_size=1,
     gradient_accumulation_steps=4,
+    num_train_epochs=2,
+    logging_dir="./logs",
+    logging_steps=10,
+    save_strategy="epoch",
     save_total_limit=2,
     fp16=True,
+    bf16=False,
+    report_to="none",  # Prevent HF integration logs
 )

+# === Trainer ===
 trainer = Trainer(
     model=model,
+    args=training_args,
+    train_dataset=tokenized_dataset,
     tokenizer=tokenizer,
+    data_collator=data_collator
 )

+# === Start Training ===
 trainer.train()
+
+# === Save Final Model ===
+trainer.save_model(OUTPUT_DIR)
+tokenizer.save_pretrained(OUTPUT_DIR)
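Two small observations on the new version. First, the added import os is never used and could be dropped. Second, the script expects python_ai_dataset.jsonl to hold one JSON object per line with prompt and completion fields, which is exactly what the tokenize function reads; illustrative records (not from the commit) would look like:

{"prompt": "Write a Python function that reverses a string.", "completion": "def reverse(s):\n    return s[::-1]"}
{"prompt": "Sum a list of numbers.", "completion": "def total(xs):\n    return sum(xs)"}

One caveat worth checking: DataCollatorForLanguageModeling pads each batch and raises an error when the tokenizer has no pad token; if the StarCoder tokenizer ships without one (not verified here), the usual fix is tokenizer.pad_token = tokenizer.eos_token before building the collator.

Since the final weights now land in train_output rather than on the Hub, they load back with the standard from_pretrained calls; a minimal sketch, with the prompt string purely illustrative:

from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the checkpoint written by trainer.save_model / tokenizer.save_pretrained
tokenizer = AutoTokenizer.from_pretrained("train_output")
model = AutoModelForCausalLM.from_pretrained("train_output")

inputs = tokenizer("Write a Python function that reverses a string.\n", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))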