Percy3822 committed on
Commit
8033fa5
·
verified ·
1 Parent(s): 5de456d

Update train.py

Browse files
Files changed (1) hide show
  1. train.py +18 -31
train.py CHANGED
@@ -1,24 +1,26 @@
1
- from datasets import load_dataset, Dataset
2
- from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
3
- import os
4
 
5
- # Load dataset
6
- dataset = load_dataset("json", data_files="python.jsonl")
7
 
8
  # Load tokenizer and model
9
  model_name = "distilgpt2"
10
  tokenizer = AutoTokenizer.from_pretrained(model_name)
11
- tokenizer.pad_token = tokenizer.eos_token
12
  model = AutoModelForCausalLM.from_pretrained(model_name)
13
 
14
- # Tokenize function
15
- def tokenize_function(example):
16
- full_text = f"### Prompt:\n{example['prompt']}\n### Completion:\n{example['completion']}"
17
- return tokenizer(full_text, truncation=True, padding="max_length", max_length=512)
 
 
18
 
19
- tokenized_dataset = dataset["train"].map(tokenize_function)
 
20
 
21
- # Training arguments
22
  training_args = TrainingArguments(
23
  output_dir="trained_model",
24
  learning_rate=2e-5,
@@ -29,27 +31,12 @@ training_args = TrainingArguments(
29
  logging_steps=1
30
  )
31
 
32
- # Trainer
33
  trainer = Trainer(
34
  model=model,
35
  args=training_args,
36
- train_dataset=tokenized_dataset,
37
  )
38
 
39
- # Train
40
- trainer.train()
41
-
42
- # Save and push model to hub
43
- repo_name = "Percy3822/python_coder_100"
44
- trainer.save_model(repo_name)
45
- tokenizer.save_pretrained(repo_name)
46
-
47
- # Optional: push to hub
48
- from huggingface_hub import HfApi
49
- api = HfApi()
50
- api.upload_folder(
51
- folder_path=repo_name,
52
- path_in_repo="",
53
- repo_id=repo_name,
54
- repo_type="model"
55
- )
 
1
+ from datasets import load_dataset
2
+ from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
 
3
 
4
+ # Load dataset from jsonl
5
+ dataset = load_dataset("json", data_files="Python.jsonl")
6
 
7
  # Load tokenizer and model
8
  model_name = "distilgpt2"
9
  tokenizer = AutoTokenizer.from_pretrained(model_name)
10
+ tokenizer.pad_token = tokenizer.eos_token # Fix for padding error
11
  model = AutoModelForCausalLM.from_pretrained(model_name)
12
 
13
+ # Tokenization and label setup for causal LM
14
def preprocess_function(examples):
    """Tokenize one prompt/completion record and attach causal-LM labels.

    Args:
        examples: A single dataset record providing string values under the
            "prompt" and "completion" keys (the function concatenates them
            directly, so it is meant for unbatched ``Dataset.map`` calls).

    Returns:
        The tokenizer output (truncated/padded to 512 tokens) with an added
        "labels" entry: a copy of "input_ids" in which every padding position
        is replaced by -100.
    """
    full_text = examples["prompt"] + examples["completion"]
    model_inputs = tokenizer(full_text, truncation=True, padding="max_length", max_length=512)

    # The pad token is the EOS token, so padding cannot be recognised by token
    # id without also hiding the real EOS. Use the attention mask instead and
    # mark padded positions with -100, the ignore index of the causal-LM loss,
    # so the model is not trained to predict hundreds of padding tokens.
    model_inputs["labels"] = [
        token_id if is_real_token else -100
        for token_id, is_real_token in zip(
            model_inputs["input_ids"], model_inputs["attention_mask"]
        )
    ]
    return model_inputs
19
 
20
+ # Apply preprocessing
21
+ tokenized_dataset = dataset["train"].map(preprocess_function)
22
 
23
+ # Training configuration
24
  training_args = TrainingArguments(
25
  output_dir="trained_model",
26
  learning_rate=2e-5,
 
31
  logging_steps=1
32
  )
33
 
34
+ # Trainer setup
35
  trainer = Trainer(
36
  model=model,
37
  args=training_args,
38
+ train_dataset=tokenized_dataset
39
  )
40
 
41
+ # Start training
42
+ trainer.train()