| | import math |
| |
|
| | import torch |
| | from tokenizers import Tokenizer |
| | from transformers import PreTrainedTokenizerFast, get_cosine_schedule_with_warmup |
| |
|
| | from training import PreTrainer |
| | from tynerox.modeling import TyneRoxModel, TyneRoxConfig |
| | from dataset.pre_train import create_train_dataloader |
| |
|
if __name__ == "__main__":
    # Prefer GPU, but fall back gracefully so the script still starts on CPU-only hosts.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Build a HF fast tokenizer from the raw 36k-BPE tokenizer file. The model has
    # no dedicated PAD token, so <|endoftext|> doubles as both PAD and EOS.
    tokenizer = Tokenizer.from_file("tokenizer/tokens-bpe-36k.json")
    tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=tokenizer,
        unk_token="[UNK]",
        pad_token="<|endoftext|>",
        eos_token="<|endoftext|>",
    )

    # Persist the tokenizer next to where the final model checkpoint is written below.
    tokenizer.save_pretrained("../")

    config = TyneRoxConfig(
        vocab_size=tokenizer.vocab_size,
        pad_token_id=tokenizer.pad_token_id,
    )

    model = TyneRoxModel(config)
    model.to(device)

    # Dataloader over the pre-training corpus (HF dataset repo id).
    # batch_size is defined once here so the value cannot drift from the loader.
    folder_path = "bobboyms/subset-Itau-Unibanco-aroeira-1B-tokens"
    batch_size = 5
    dataloader = create_train_dataloader(
        folder_path,
        tokenizer,
        batch_size=batch_size,
        max_length=1024,
        drop_last=True,
        num_workers=10,
    )

    # Keep a handle on the eager model before compiling: torch.compile returns an
    # OptimizedModule whose state_dict keys are prefixed with "_orig_mod.", which
    # would corrupt a checkpoint saved through the wrapper.
    base_model = model
    model = torch.compile(model)
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=0.000461,
        weight_decay=0.1,
    )

    # Cosine LR schedule with a linear warmup over the first 5% of total steps.
    epochs = 1
    warmup_ratio = 0.05
    num_training_steps = len(dataloader) * epochs
    num_warmup_steps = math.floor(num_training_steps * warmup_ratio)

    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps,
    )

    # Portuguese prompts used for qualitative generation samples during training.
    sample_prompts = [
        "Olá, como vai você? ",
        "Quando a manhã chegou, Iracema ainda estava ali, debruçada, como uma borboleta que ",
        "Não, respondeu; na verdade, estou com medo ",
        "O resultado representa uma desaceleração ",
        "No vídeo, é possível ver ",
        "Essa receita de torta de frango ",
        "Durante o primeiro mandato ",
        "Os donos de cães "
    ]

    # MLflow-style experiment tracking configuration consumed by PreTrainer.
    logger_config = {
        "tracking_uri": "http://127.0.0.1:5000",
        "experiment": "Pre training LLM",
        "model_name": "Pre training LLM (Long Context)"
    }

    trainer = PreTrainer(
        model=model,
        optimizer=optimizer,
        scheduler=scheduler,
        tokenizer=tokenizer,
        train_loader=dataloader,
        test_loader=None,
        logger_config=logger_config,
        use_amp=True,
    )

    trainer.train(num_epochs=epochs, sample_prompts=sample_prompts)

    # Save from the eager model (not the compile wrapper) so state_dict keys
    # are clean; training updates are shared, since compile wraps the same module.
    base_model.save_pretrained("../")
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|