# -*- coding: utf-8 -*-
"""Fine Tuning Numer Two.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1iqPWMaXrktOsY2BwZNdQE8c1B4o1trit
"""
# Install dependencies once; transformers[torch] already pulls in torch,
# and accelerate is required by Trainer.
!pip install -q -U datasets "transformers[torch]" accelerate huggingface_hub
from transformers import AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling, TrainingArguments, Trainer
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("ajibawa-2023/Children-Stories-Collection", trust_remote_code=True)
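
# Optional sanity check, a minimal sketch not in the original notebook:
# confirm the split names and that each example exposes a "text" field,
# since the tokenize functions below assume that column.
print(dataset)
print(dataset["train"][0]["text"][:200])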
# Load the pre-trained tokenizers (the models themselves are loaded later)
tokenizerOne = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
tokenizerTwo = AutoTokenizer.from_pretrained("distilbert/distilbert-base-cased")
# Tokenize the dataset. Both tokenizers pad/truncate to 512 tokens
# (BERT's maximum sequence length); max_length is made explicit here.
def tokenize_function_one(examples):
    return tokenizerOne(examples["text"], padding="max_length", truncation=True, max_length=512)

def tokenize_function_two(examples):
    return tokenizerTwo(examples["text"], padding="max_length", truncation=True, max_length=512)
# Tokenize, shuffle, and carve out 10k training / 2.5k held-out examples
tokenizedDatasetOne = dataset.map(tokenize_function_one, batched=True)
shuffled_dataset = tokenizedDatasetOne['train'].shuffle(seed=42)
tokenized_datasets_oneTrain = shuffled_dataset.select(range(10000))
tokenized_datasets_oneTest = shuffled_dataset.select(range(10000, 12500))

# Collator that randomly masks 15% of tokens for the MLM objective
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizerOne, mlm_probability=0.15)
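
# Optional peek at one collated batch (a sketch, not part of the original
# run): the collator replaces ~15% of tokens with [MASK]; unmasked positions
# get label -100 so they are ignored by the loss.
sample = [{"input_ids": tokenized_datasets_oneTrain[i]["input_ids"]} for i in range(2)]
batch = data_collator(sample)
print(batch["input_ids"].shape, (batch["labels"] != -100).float().mean())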
training_args = TrainingArguments(
    "test_trainer",
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
)
# Model One: google-bert/bert-base-cased
model_one = AutoModelForMaskedLM.from_pretrained("google-bert/bert-base-cased")

trainer_one = Trainer(
    model=model_one,
    args=training_args,
    train_dataset=tokenized_datasets_oneTrain,
    eval_dataset=tokenized_datasets_oneTest,
    data_collator=data_collator,
)

trainer_one.train()
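
# Held-out evaluation for the BERT run (a sketch added for illustration;
# eval_results_one is not a name from the original notebook). Perplexity
# for a masked LM is exp(eval loss).
import math
eval_results_one = trainer_one.evaluate()
print(f"BERT eval loss: {eval_results_one['eval_loss']:.4f}, "
      f"perplexity: {math.exp(eval_results_one['eval_loss']):.2f}")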
# Get your API token from HuggingFace.
api_token = "redacted"
# Push the fine-tuned model and its tokenizer to the Hub
model_one.push_to_hub("emma7897/bert_two", token=api_token)
tokenizerOne.push_to_hub("emma7897/bert_two", token=api_token)
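
# Quick smoke test of the pushed model (assumes the push above succeeded
# and the repo is accessible): fill-mask is the natural pipeline for a
# masked LM, and BERT's mask token is [MASK].
from transformers import pipeline
fill_mask = pipeline("fill-mask", model="emma7897/bert_two")
for prediction in fill_mask("Once upon a time there was a little [MASK]."):
    print(prediction["token_str"], round(prediction["score"], 3))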
# Repeat the same tokenize/shuffle/split pipeline with the DistilBERT tokenizer
tokenizedDatasetTwo = dataset.map(tokenize_function_two, batched=True)
shuffled_dataset = tokenizedDatasetTwo['train'].shuffle(seed=42)
tokenized_datasets_twoTrain = shuffled_dataset.select(range(10000))
tokenized_datasets_twoTest = shuffled_dataset.select(range(10000, 12500))

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizerTwo, mlm_probability=0.15)
# Separate output directory so checkpoints don't clobber the first run's
training_args = TrainingArguments(
    "test_trainer_two",
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
)
# Model Two: distilbert/distilbert-base-cased
model_two = AutoModelForMaskedLM.from_pretrained("distilbert/distilbert-base-cased")

trainer_two = Trainer(
    model=model_two,
    args=training_args,
    train_dataset=tokenized_datasets_twoTrain,
    eval_dataset=tokenized_datasets_twoTest,
    data_collator=data_collator,
)

trainer_two.train()
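
# Mirror of the evaluation sketch above, so the two models can be compared
# on their (separately tokenized) held-out splits.
eval_results_two = trainer_two.evaluate()
print(f"DistilBERT eval loss: {eval_results_two['eval_loss']:.4f}, "
      f"perplexity: {math.exp(eval_results_two['eval_loss']):.2f}")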
# Push the fine-tuned DistilBERT and its tokenizer to the Hub
model_two.push_to_hub("emma7897/distilbert_two", token=api_token)
tokenizerTwo.push_to_hub("emma7897/distilbert_two", token=api_token)
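
# Smoke test for the DistilBERT repo as well (same assumptions as above);
# DistilBERT also uses [MASK] as its mask token.
fill_mask_distil = pipeline("fill-mask", model="emma7897/distilbert_two")
for prediction in fill_mask_distil("The children played in the [MASK] all day."):
    print(prediction["token_str"], round(prediction["score"], 3))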