V1.0
Browse files
app.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
|
| 2 |
+
import torch
|
| 3 |
+
import streamlit as st
|
| 4 |
+
|
| 5 |
+
# Hub id of the checkpoint that supplies both the tokenizer and the weights.
_CHECKPOINT_ID = "togethercomputer/GPT-NeoXT-Chat-Base-20B"

# Fetch the pre-trained tokenizer and causal language model for that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT_ID)
model = AutoModelForCausalLM.from_pretrained(_CHECKPOINT_ID)
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
import os
import tempfile

# Raw training text. NOTE: this is the corpus itself, not a path to a file.
dataa = "My name is youssef khemiri i am 21 years old and i am a data scientist"

# Prepare the dataset
# TextDataset reads its corpus from a file on disk, so the text is written to a
# temporary file first. The dataset is tokenized inside the constructor, so the
# temporary directory (and the feature cache TextDataset writes next to the
# input file) can be discarded as soon as the dataset object exists.
#
# TextDataset keeps only complete blocks of `block_size` tokens, so a corpus
# shorter than 128 tokens would produce an empty dataset. The block size is
# capped at the corpus length so this short text still yields one example.
_corpus_token_count = len(tokenizer.tokenize(dataa))
_block_size = min(
    128,
    _corpus_token_count + tokenizer.num_special_tokens_to_add(pair=False),
)

with tempfile.TemporaryDirectory() as _corpus_dir:
    _corpus_path = os.path.join(_corpus_dir, "train.txt")
    with open(_corpus_path, "w", encoding="utf-8") as _corpus_file:
        _corpus_file.write(dataa)

    train_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=_corpus_path,
        block_size=_block_size,
    )
|
| 18 |
+
|
| 19 |
+
# Build the batch collator. mlm=False turns off masked-language-model masking,
# i.e. the collator is configured for causal language modelling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
|
| 23 |
+
|
| 24 |
+
# Initialize the trainer
# output_dir is passed explicitly: it is where checkpoints are written, and
# transformers releases that give it no default reject a TrainingArguments()
# call without it.
training_args = TrainingArguments(
    output_dir="./results",          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    save_steps=10_000,               # number of steps between saving checkpoints
    save_total_limit=2,              # limit the total amount of checkpoints to save
    prediction_loss_only=True,
    learning_rate=5e-5,
)
|
| 34 |
+
|
| 35 |
+
# Wire the model, hyper-parameters, dataset and collator into a Trainer.
trainer = Trainer(
    data_collator=data_collator,
    train_dataset=train_dataset,
    args=training_args,
    model=model,
)

# Run fine-tuning, then report completion on the Streamlit page.
trainer.train()
st.write("finished training")
|
| 45 |
+
|
| 46 |
+
# infer
# The model is still in training mode after Trainer.train(); switch it to
# evaluation mode so dropout-style layers are inactive during generation.
model.eval()

# Encode the chat-formatted prompt and move the tensors to the model's device.
inputs = tokenizer("<human>: Tell me about youssef khemiri\n<bot>:", return_tensors='pt').to(model.device)

# Sample a short continuation (at most 10 new tokens) from the fine-tuned model.
outputs = model.generate(**inputs, max_new_tokens=10, do_sample=True, temperature=0.8)

# Decode the first (only) returned sequence and show it on the Streamlit page.
output_str = tokenizer.decode(outputs[0])
st.write(output_str)
|
| 51 |
+
|