Percy3822 committed
Commit 10a7fed · verified · 1 Parent(s): 5bab387

Update train.py

Files changed (1)
  1. train.py +21 -19
train.py CHANGED
@@ -1,46 +1,48 @@
 from datasets import load_dataset
 from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
+import os
+import shutil

-# Load your dataset
+# Load dataset
 dataset = load_dataset("json", data_files="python.jsonl")

-# Load model and tokenizer
+# Load tokenizer and model
 tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
 model = AutoModelForCausalLM.from_pretrained("distilgpt2")
-
-# Add pad token if needed
 tokenizer.pad_token = tokenizer.eos_token
 model.config.pad_token_id = tokenizer.pad_token_id

-# Tokenize function
+# Tokenization
 def tokenize(example):
     full_text = example["prompt"] + example["completion"]
-    tokens = tokenizer(full_text, padding="max_length", truncation=True, max_length=512)
+    tokens = tokenizer(full_text, truncation=True, padding="max_length", max_length=512)
     tokens["labels"] = tokens["input_ids"].copy()
     return tokens

-# Tokenize dataset
-tokenized_dataset = dataset["train"].map(tokenize)
+tokenized = dataset["train"].map(tokenize)

-# Training settings
-training_args = TrainingArguments(
-    output_dir="trained_model",
-    num_train_epochs=1,
+# Training
+args = TrainingArguments(
+    output_dir="./results",
     per_device_train_batch_size=2,
+    num_train_epochs=1,
     logging_steps=10,
-    save_strategy="epoch",
-    push_to_hub=False,
+    save_strategy="no",
 )

-# Trainer
 trainer = Trainer(
     model=model,
+    args=args,
+    train_dataset=tokenized,
     tokenizer=tokenizer,
-    args=training_args,
-    train_dataset=tokenized_dataset,
 )

-# Train and save
 trainer.train()
+
+# Save model
 trainer.save_model("trained_model")
-tokenizer.save_pretrained("trained_model")
+tokenizer.save_pretrained("trained_model")
+
+# Zip the model
+shutil.make_archive("trained_model", 'zip', "trained_model")
+print("✅ Training complete. Model zipped to 'trained_model.zip'")