Percy3822 committed on
Commit
2ba6539
·
verified ·
1 Parent(s): f1a2964

Update train.py

Browse files
Files changed (1) hide show
  1. train.py +16 -18
train.py CHANGED
@@ -1,48 +1,46 @@
1
  from datasets import load_dataset
2
  from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
3
- import os
4
 
5
- # Load dataset
6
  dataset = load_dataset("json", data_files="python.jsonl")
7
 
8
- # Load tokenizer and model
9
  tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
10
- tokenizer.pad_token = tokenizer.eos_token
11
  model = AutoModelForCausalLM.from_pretrained("distilgpt2")
 
 
 
12
  model.config.pad_token_id = tokenizer.pad_token_id
13
 
14
- # Tokenization function
15
- def tokenize_function(example):
16
  full_text = example["prompt"] + example["completion"]
17
- tokens = tokenizer(full_text, truncation=True, padding="max_length", max_length=512)
18
  tokens["labels"] = tokens["input_ids"].copy()
19
  return tokens
20
 
21
  # Tokenize dataset
22
- tokenized_dataset = dataset["train"].map(tokenize_function)
23
 
24
- # Training arguments
25
  training_args = TrainingArguments(
26
- output_dir="./results",
27
- per_device_train_batch_size=2,
28
  num_train_epochs=1,
 
29
  logging_steps=10,
30
  save_strategy="epoch",
31
- logging_dir="./logs",
32
- report_to="none"
33
  )
34
 
35
- # Trainer setup
36
  trainer = Trainer(
37
  model=model,
 
38
  args=training_args,
39
  train_dataset=tokenized_dataset,
40
- tokenizer=tokenizer,
41
  )
42
 
43
- # Start training
44
  trainer.train()
45
-
46
- # Save final model
47
  trainer.save_model("trained_model")
48
  tokenizer.save_pretrained("trained_model")
 
1
  from datasets import load_dataset
2
  from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
 
3
 
4
# ---- Data and model setup ----

# JSON-lines dataset; each record carries "prompt" and "completion"
# fields (consumed by the tokenize step below).
dataset = load_dataset("json", data_files="python.jsonl")

# Pretrained distilgpt2 backbone and its matching tokenizer.
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
model = AutoModelForCausalLM.from_pretrained("distilgpt2")

# GPT-2-family tokenizers ship without a pad token; reuse EOS so
# fixed-length padding works, and mirror the id onto the model config.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id
14
 
15
# Tokenize function
def tokenize(example):
    """Tokenize one prompt+completion pair for causal-LM fine-tuning.

    Concatenates the prompt and completion, tokenizes to a fixed
    512-token window (truncating longer texts, padding shorter ones),
    and derives ``labels`` from ``input_ids`` with padding positions
    masked to -100 so the cross-entropy loss ignores them.  Without
    the mask, pad == eos here, so the model would be trained to emit
    EOS at every padded position.

    Args:
        example: dict with string fields "prompt" and "completion".

    Returns:
        dict with "input_ids", "attention_mask", and "labels",
        each a list of 512 ints.
    """
    full_text = example["prompt"] + example["completion"]
    tokens = tokenizer(full_text, padding="max_length", truncation=True, max_length=512)
    # Labels mirror the input ids; attention_mask == 0 marks padding.
    tokens["labels"] = [
        tok if mask == 1 else -100
        for tok, mask in zip(tokens["input_ids"], tokens["attention_mask"])
    ]
    return tokens
21
 
22
# ---- Tokenize, train, and save ----

# Apply the tokenize step to every training example.
tokenized_dataset = dataset["train"].map(tokenize)

# Run configuration: one epoch, tiny per-device batches, checkpoint
# once per epoch, and keep everything local (no Hub upload).
training_args = TrainingArguments(
    output_dir="trained_model",
    num_train_epochs=1,
    per_device_train_batch_size=2,
    logging_steps=10,
    save_strategy="epoch",
    push_to_hub=False,
)

# Wire the model, arguments, and tokenized data into the HF Trainer.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=tokenized_dataset,
)

# Fine-tune, then persist the weights and tokenizer side by side so
# the saved directory can be reloaded with from_pretrained().
trainer.train()
trainer.save_model("trained_model")
tokenizer.save_pretrained("trained_model")