Percy3822 committed on
Commit 23b4c59 · verified · 1 Parent(s): c1c8351

Update train.py

Files changed (1)
  1. train.py +31 -29
train.py CHANGED
@@ -1,42 +1,44 @@
 from datasets import load_dataset
-from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
+from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
 
-# Load dataset from jsonl
+# Load your dataset
 dataset = load_dataset("json", data_files="python.jsonl")
 
 # Load tokenizer and model
-model_name = "distilgpt2"
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-tokenizer.pad_token = tokenizer.eos_token # Fix for padding error
-model = AutoModelForCausalLM.from_pretrained(model_name)
-
-# Tokenization and label setup for causal LM
-def preprocess_function(examples):
-    full_text = examples["prompt"] + examples["completion"]
-    model_inputs = tokenizer(full_text, truncation=True, padding="max_length", max_length=512)
-    model_inputs["labels"] = model_inputs["input_ids"].copy() # Important for loss calculation
-    return model_inputs
-
-# Apply preprocessing
-tokenized_dataset = dataset["train"].map(preprocess_function)
-
-# Training configuration
+tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
+model = AutoModelForCausalLM.from_pretrained("distilgpt2")
+
+# Add padding token if missing
+tokenizer.pad_token = tokenizer.eos_token
+model.config.pad_token_id = tokenizer.pad_token_id
+
+# Tokenize data
+def tokenize_function(example):
+    full_text = example["prompt"] + example["completion"]
+    return tokenizer(full_text, truncation=True, padding="max_length", max_length=512)
+
+tokenized_dataset = dataset["train"].map(tokenize_function)
+
+# Training config
 training_args = TrainingArguments(
-    output_dir="trained_model",
-    learning_rate=2e-5,
-    per_device_train_batch_size=4,
-    num_train_epochs=5,
-    weight_decay=0.01,
-    save_total_limit=1,
-    logging_steps=1
+    output_dir="./results",
+    per_device_train_batch_size=2,
+    num_train_epochs=1,
+    logging_steps=10,
+    save_strategy="no",
 )
 
-# Trainer setup
+# Trainer
 trainer = Trainer(
     model=model,
     args=training_args,
-    train_dataset=tokenized_dataset
+    train_dataset=tokenized_dataset,
+    tokenizer=tokenizer,
 )
 
-# Start training
-trainer.train()
+# Train
+trainer.train()
+
+# ✅ Save model & tokenizer
+trainer.save_model("trained_model")
+tokenizer.save_pretrained("trained_model")
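
A note on the updated script: tokenize_function now returns only input_ids and attention_mask, whereas the previous revision also copied input_ids into labels, which is what gives Trainer a causal-LM loss to minimize. A minimal sketch of one way to restore label creation without changing tokenize_function, reusing the model, training_args, tokenized_dataset, and tokenizer defined above, is to pass a data collator that builds labels automatically (this is a suggested follow-up, not part of this commit):

from transformers import DataCollatorForLanguageModeling, Trainer

# With mlm=False the collator clones input_ids into labels and sets padding
# positions to -100 so the loss ignores them. Because pad_token is set to
# eos_token in this script, real EOS tokens get masked too, a known caveat
# of reusing EOS as the padding token.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

With the collator in place, trainer.train() computes the standard next-token loss instead of failing for lack of labels.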