Percy3822 committed on
Commit
04a8e34
·
verified ·
1 Parent(s): 14a5a47

Create train.py

Browse files
Files changed (1) hide show
  1. train.py +47 -0
train.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Fine-tune StarCoderBase-7B as a causal LM on a prompt/completion dataset.

Loads the dataset from the Hugging Face Hub, tokenizes each
prompt/completion pair into a single training sequence, and trains with
the HF Trainer, pushing checkpoints and the final model back to the Hub.
"""
import os

from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

model_id = "bigcode/starcoderbase-7b"
dataset_repo = "Percy3822/python_ai_coder"  # Your HF dataset repo

# Load dataset
dataset = load_dataset(dataset_repo, split="train")

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)

# StarCoder's tokenizer ships without a pad token, and the LM collator
# pads batches — without this it raises at the first batch. Reusing EOS
# as PAD is the standard fix for causal-LM fine-tuning.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


# Tokenize: join prompt and completion into one sequence and append EOS
# so the model learns where a completion ends.
def tokenize(example):
    text = example["prompt"] + "\n" + example["completion"] + tokenizer.eos_token
    return tokenizer(text, truncation=True, max_length=512)


tokenized = dataset.map(tokenize, remove_columns=["prompt", "completion"])
# mlm=False -> causal-LM labels (inputs shifted), not masked-LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training config.
# NOTE: evaluation is disabled by default, so the old
# `evaluation_strategy="no"` kwarg (renamed `eval_strategy` in
# transformers >= 4.41) is omitted for cross-version compatibility.
args = TrainingArguments(
    output_dir="./python-ai-model",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,
    fp16=True,  # requires a CUDA GPU; set False for CPU-only runs
    push_to_hub=True,
    hub_model_id="Percy3822/python_ai_coder",
    # SECURITY: never hard-code a token in source. Read it from the
    # environment; None falls back to the cached `huggingface-cli login`
    # credentials or a linked HF Space token.
    hub_token=os.environ.get("HF_TOKEN"),
)

trainer = Trainer(
    model=model,
    train_dataset=tokenized,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
)

trainer.train()
trainer.push_to_hub()