Percy3822 committed on
Commit
62af157
·
verified ·
1 Parent(s): 34a9738

Update train.py

Browse files
Files changed (1) hide show
  1. train.py +12 -11
train.py CHANGED
@@ -1,37 +1,38 @@
1
  from datasets import load_dataset
2
  from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
 
3
 
4
- # Load your dataset
5
  dataset = load_dataset("json", data_files="python.jsonl")
6
 
7
  # Load tokenizer and model
8
  tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
9
- model = AutoModelForCausalLM.from_pretrained("distilgpt2")
10
-
11
- # Add padding token if missing
12
  tokenizer.pad_token = tokenizer.eos_token
 
13
  model.config.pad_token_id = tokenizer.pad_token_id
14
 
15
- # Tokenize function
16
  def tokenize_function(example):
17
  full_text = example["prompt"] + example["completion"]
18
  tokens = tokenizer(full_text, truncation=True, padding="max_length", max_length=512)
19
  tokens["labels"] = tokens["input_ids"].copy()
20
  return tokens
21
 
22
- # Tokenize
23
  tokenized_dataset = dataset["train"].map(tokenize_function)
24
 
25
- # Training configuration
26
  training_args = TrainingArguments(
27
  output_dir="./results",
28
  per_device_train_batch_size=2,
29
  num_train_epochs=1,
30
  logging_steps=10,
31
- save_strategy="no",
 
 
32
  )
33
 
34
- # Trainer
35
  trainer = Trainer(
36
  model=model,
37
  args=training_args,
@@ -39,9 +40,9 @@ trainer = Trainer(
39
  tokenizer=tokenizer,
40
  )
41
 
42
- # Train
43
  trainer.train()
44
 
45
- # Save model and tokenizer
46
  trainer.save_model("trained_model")
47
  tokenizer.save_pretrained("trained_model")
 
1
  from datasets import load_dataset
2
  from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
3
+ import os
4
 
5
+ # Load dataset
6
  dataset = load_dataset("json", data_files="python.jsonl")
7
 
8
  # Load tokenizer and model
9
  tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
 
 
 
10
  tokenizer.pad_token = tokenizer.eos_token
11
+ model = AutoModelForCausalLM.from_pretrained("distilgpt2")
12
  model.config.pad_token_id = tokenizer.pad_token_id
13
 
14
+ # Tokenization function
15
  def tokenize_function(example):
16
  full_text = example["prompt"] + example["completion"]
17
  tokens = tokenizer(full_text, truncation=True, padding="max_length", max_length=512)
18
  tokens["labels"] = tokens["input_ids"].copy()
19
  return tokens
20
 
21
+ # Tokenize dataset
22
  tokenized_dataset = dataset["train"].map(tokenize_function)
23
 
24
+ # Training arguments
25
  training_args = TrainingArguments(
26
  output_dir="./results",
27
  per_device_train_batch_size=2,
28
  num_train_epochs=1,
29
  logging_steps=10,
30
+ save_strategy="epoch",
31
+ logging_dir="./logs",
32
+ report_to="none"
33
  )
34
 
35
+ # Trainer setup
36
  trainer = Trainer(
37
  model=model,
38
  args=training_args,
 
40
  tokenizer=tokenizer,
41
  )
42
 
43
+ # Start training
44
  trainer.train()
45
 
46
+ # Save final model
47
  trainer.save_model("trained_model")
48
  tokenizer.save_pretrained("trained_model")