gpt-fi/data/fine-tuning/create_online_reviews.py
Vaino Hatanpaa
add training and evaluation scripts
ceedef8
raw
history blame
986 Bytes
from datasets import load_dataset
from transformers import AutoTokenizer
import datasets
import os
def main():
    """Build a tokenized train/test split of online reviews for fine-tuning.

    Loads raw review JSON files through a custom loading script, tokenizes
    the "text" field to fixed-length (512-token) padded sequences, converts
    the "rating" column into float "labels", then shuffles and splits 90/10.

    Returns:
        A ``datasets.DatasetDict`` with "train" and "test" splits.
    """
    # Always reprocess the raw files on every run instead of reusing cached
    # Arrow files. (set_caching_enabled(False) is deprecated in recent
    # `datasets` releases; disable_caching() is the supported replacement.)
    datasets.disable_caching()

    # TODO(review): placeholder path — point at the real tokenizer location.
    tokenizer = AutoTokenizer.from_pretrained(r"/tokenizer/loc")
    # GPT-style tokenizers ship without a pad token; reuse EOS for padding.
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

    # TODO(review): placeholder path — point at the real review-JSON directory.
    data_loc = "path/to/review/jsons"
    data_files = [fil.path for fil in os.scandir(data_loc)]
    # Custom loading script defines how the review JSONs are parsed.
    dataset = load_dataset('online_reviews_loading.py', data_files=data_files)

    def tokenize_function(examples):
        # Fixed-length padding so every example is exactly 512 tokens.
        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

    def process_rating(example):
        # Regression-style targets: cast each rating to float under "labels".
        example["labels"] = [float(item) for item in example["rating"]]
        return example

    dataset = (
        dataset["train"]
        .map(tokenize_function, batched=True)
        .map(process_rating, batched=True, remove_columns=['rating'])
        .shuffle(seed=42)
        .train_test_split(test_size=0.1)
    )
    # Fix: the original computed the split and discarded it — return it so
    # callers (and interactive use) can actually consume the prepared data.
    return dataset
# Run the dataset-preparation pipeline only when executed as a script.
if __name__ == "__main__":
    main()