Spaces:
Paused
Paused
Update train.py
Browse files
train.py
CHANGED
|
@@ -23,8 +23,14 @@ CLIPPING = 1.0
|
|
| 23 |
PUSH_TO_HUB = True
|
| 24 |
|
| 25 |
def load_data():
|
| 26 |
-
|
| 27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
def create_tokenizer(training_corpus):
|
| 30 |
tokenizer = ByteLevelBPETokenizer()
|
|
@@ -42,19 +48,23 @@ def get_training_corpus(dataset):
|
|
| 42 |
for i in range(0, len(dataset), 1000):
|
| 43 |
yield dataset[i : i + 1000]["text"]
|
| 44 |
|
| 45 |
-
def format_prompts(examples, tokenizer):
|
| 46 |
texts = []
|
| 47 |
for text in examples['text']:
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
|
|
|
|
|
|
|
|
|
| 57 |
return {"text": texts}
|
|
|
|
| 58 |
|
| 59 |
def create_model(tokenizer):
|
| 60 |
config = LlamaConfig(
|
|
@@ -90,10 +100,10 @@ def configure_tokenizer(tokenizer):
|
|
| 90 |
tokenizer.user_token_id = tokenizer.convert_tokens_to_ids("<|user|>")
|
| 91 |
tokenizer.assistant_token_id = tokenizer.convert_tokens_to_ids("<|bot|>")
|
| 92 |
|
| 93 |
-
chat_template = "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '<|user|>\n' + message['content'] + '<|end|>\n' }}{% elif message['role'] == 'assistant' %}{{ '<|bot|>\n' + message['content'] + '<|end|>\n' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}
|
| 94 |
tokenizer.chat_template = chat_template
|
| 95 |
|
| 96 |
-
def train_model(model, tokenizer, dataset, push):
|
| 97 |
args = TrainingArguments(
|
| 98 |
output_dir="model",
|
| 99 |
num_train_epochs=EPOCHS,
|
|
@@ -104,17 +114,17 @@ def train_model(model, tokenizer, dataset, push):
|
|
| 104 |
weight_decay=DECAY,
|
| 105 |
gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
|
| 106 |
fp16=FP16,
|
| 107 |
-
max_grad_norm=CLIPPING
|
|
|
|
| 108 |
)
|
| 109 |
|
| 110 |
optimizer = AdamW(model.parameters(), lr=args.learning_rate)
|
| 111 |
-
scheduler =
|
| 112 |
-
optimizer,
|
| 113 |
num_warmup_steps=args.warmup_steps,
|
| 114 |
num_training_steps=(len(dataset) // args.per_device_train_batch_size) * args.num_train_epochs
|
| 115 |
)
|
| 116 |
-
|
| 117 |
-
dataset = dataset.map(lambda examples: format_prompts(examples, tokenizer), batched=True)
|
| 118 |
trainer = trl.SFTTrainer(
|
| 119 |
model=model,
|
| 120 |
tokenizer=tokenizer,
|
|
@@ -133,19 +143,22 @@ def train_model(model, tokenizer, dataset, push):
|
|
| 133 |
if push:
|
| 134 |
repo_id = OUTPUT_REPO
|
| 135 |
msg = str(train.training_loss)
|
| 136 |
-
trained_model.push_to_hub(repo_id, commit_message=msg)
|
| 137 |
-
trained_tokenizer.push_to_hub(repo_id, commit_message=msg)
|
| 138 |
else:
|
| 139 |
trained_model.save_pretrained("model")
|
| 140 |
trained_tokenizer.save_pretrained("tokenizer")
|
| 141 |
|
| 142 |
def main(push_to_hub=True):
|
| 143 |
dataset = load_data()
|
| 144 |
-
|
|
|
|
|
|
|
| 145 |
tokenizer = create_tokenizer(training_corpus)
|
| 146 |
configure_tokenizer(tokenizer)
|
| 147 |
model = create_model(tokenizer)
|
| 148 |
-
train_model(model, tokenizer,
|
|
|
|
| 149 |
|
| 150 |
if __name__ == "__main__":
|
| 151 |
main(PUSH_TO_HUB)
|
|
|
|
| 23 |
PUSH_TO_HUB = True
|
| 24 |
|
| 25 |
def load_data():
    """Load the pre-training and instruction splits used by this script.

    Returns:
        A ``DatasetDict`` with two entries:
        ``'pretrain'`` - the first 200000 examples taken from the streamed
        ``"cosmopedia-v2"`` config of ``INPUT_DATASET``;
        ``'instruct'`` - the first 200000 rows of ``INSTRUCT_DATASET``.
    """
    # Stream the pre-training corpus so that only the prefix we actually
    # consume is pulled, then turn that prefix into a regular Dataset.
    stream = load_dataset(INPUT_DATASET, "cosmopedia-v2", split="train", streaming=True)
    pretrain_rows = Dataset.from_generator(lambda: stream.take(200000))

    # The instruction set is loaded normally and truncated by index.
    instruct_all = load_dataset(INSTRUCT_DATASET, split="train")
    instruct_rows = instruct_all.select(range(200000))

    return DatasetDict({'pretrain': pretrain_rows, 'instruct': instruct_rows})
|
| 34 |
|
| 35 |
def create_tokenizer(training_corpus):
|
| 36 |
tokenizer = ByteLevelBPETokenizer()
|
|
|
|
| 48 |
for i in range(0, len(dataset), 1000):
|
| 49 |
yield dataset[i : i + 1000]["text"]
|
| 50 |
|
| 51 |
+
def format_prompts(examples, tokenizer, isinst):
    """Prepare a batch of raw texts for training.

    Args:
        examples: Batch mapping whose ``'text'`` entry is an iterable of strings.
        tokenizer: Object exposing ``apply_chat_template(conversation, tokenize=False)``.
        isinst: When truthy, each text is parsed as a ``<|end|>``-delimited
            user/bot exchange and re-rendered through the chat template;
            otherwise texts are passed through unchanged.

    Returns:
        ``{"text": [...]}`` with one output string per input text.
    """
    if not isinst:
        # Plain pre-training text: nothing to reformat.
        return {"text": list(examples['text'])}

    rendered = []
    for raw in examples['text']:
        segments = raw.split('<|end|>')
        turns = []
        # Even-indexed segments are user turns, odd-indexed ones are the
        # replies; zip() drops a trailing segment that has no partner.
        for user_part, bot_part in zip(segments[::2], segments[1::2]):
            turns.append({"role": "user", "content": user_part.replace("<|user|>", "")})
            turns.append({"role": "assistant", "content": bot_part.replace("<|bot|>", "")})
        rendered.append(tokenizer.apply_chat_template(turns, tokenize=False))
    return {"text": rendered}
|
| 67 |
+
|
| 68 |
|
| 69 |
def create_model(tokenizer):
|
| 70 |
config = LlamaConfig(
|
|
|
|
| 100 |
tokenizer.user_token_id = tokenizer.convert_tokens_to_ids("<|user|>")
|
| 101 |
tokenizer.assistant_token_id = tokenizer.convert_tokens_to_ids("<|bot|>")
|
| 102 |
|
| 103 |
+
chat_template = "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '<|user|>\n' + message['content'] + '<|end|>\n' }}{% elif message['role'] == 'assistant' %}{{ '<|bot|>\n' + message['content'] + '<|end|>\n' + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
|
| 104 |
tokenizer.chat_template = chat_template
|
| 105 |
|
| 106 |
+
def train_model(model, tokenizer, dataset, push, isinst):
|
| 107 |
args = TrainingArguments(
|
| 108 |
output_dir="model",
|
| 109 |
num_train_epochs=EPOCHS,
|
|
|
|
| 114 |
weight_decay=DECAY,
|
| 115 |
gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
|
| 116 |
fp16=FP16,
|
| 117 |
+
max_grad_norm=CLIPPING,
|
| 118 |
+
logging_steps=100
|
| 119 |
)
|
| 120 |
|
| 121 |
optimizer = AdamW(model.parameters(), lr=args.learning_rate)
|
| 122 |
+
scheduler = get_linear_schedule_with_warmup(
|
| 123 |
+
optimizer,
|
| 124 |
num_warmup_steps=args.warmup_steps,
|
| 125 |
num_training_steps=(len(dataset) // args.per_device_train_batch_size) * args.num_train_epochs
|
| 126 |
)
|
| 127 |
+
dataset = dataset.map(lambda examples: format_prompts(examples, tokenizer, isinst), batched=True, remove_columns=dataset.column_names)
|
|
|
|
| 128 |
trainer = trl.SFTTrainer(
|
| 129 |
model=model,
|
| 130 |
tokenizer=tokenizer,
|
|
|
|
| 143 |
if push:
|
| 144 |
repo_id = OUTPUT_REPO
|
| 145 |
msg = str(train.training_loss)
|
| 146 |
+
trained_model.push_to_hub(repo_id, commit_message=msg, force=True)
|
| 147 |
+
trained_tokenizer.push_to_hub(repo_id, commit_message=msg, force=True)
|
| 148 |
else:
|
| 149 |
trained_model.save_pretrained("model")
|
| 150 |
trained_tokenizer.save_pretrained("tokenizer")
|
| 151 |
|
| 152 |
def main(push_to_hub=True):
    """Run the full pipeline: load data, build tokenizer and model, train twice.

    Args:
        push_to_hub: Forwarded as the ``push`` argument of the second
            (instruction) ``train_model`` call; the first (pre-training)
            call always passes ``False``.
    """
    splits = load_data()
    pretrain_split = splits['pretrain']
    instruct_split = splits['instruct']

    # The tokenizer is trained on the pre-training text only.
    tokenizer = create_tokenizer(get_training_corpus(pretrain_split))
    configure_tokenizer(tokenizer)
    model = create_model(tokenizer)

    # Stage 1: pre-training pass (push=False, isinst=False).
    train_model(model, tokenizer, pretrain_split, False, False)
    # Stage 2: instruction pass on the same model (isinst=True).
    train_model(model, tokenizer, instruct_split, push_to_hub, True)
|
| 162 |
|
| 163 |
if __name__ == "__main__":
|
| 164 |
main(PUSH_TO_HUB)
|