| from datasets import load_dataset | |
| from transformers import AutoTokenizer | |
| import datasets | |
| import os | |
def main():
    """Build a tokenized review dataset with float rating labels.

    Loads raw review JSON files via a custom loading script, tokenizes the
    ``text`` column to fixed-length 512-token sequences, converts ``rating``
    into a float ``labels`` column, and returns a shuffled 90/10
    train/test split as a ``DatasetDict``.

    Side effects: disables the ``datasets`` on-disk cache, so every run
    re-processes the raw files.

    Returns:
        datasets.DatasetDict with "train" and "test" splits.
    """
    # `set_caching_enabled` was deprecated and removed in datasets 3.x;
    # `disable_caching()` is the supported replacement.
    datasets.disable_caching()

    tokenizer = AutoTokenizer.from_pretrained(r"/tokenizer/loc")
    # Reuse EOS as the pad token — common for GPT-style tokenizers that
    # ship without a dedicated padding token.
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

    data_loc = "path/to/review/jsons"
    # os.scandir returns an iterator holding an OS directory handle;
    # use it as a context manager so the handle is closed deterministically.
    with os.scandir(data_loc) as entries:
        data_files = [entry.path for entry in entries]

    dataset = load_dataset('online_reviews_loading.py', data_files=data_files)

    def tokenize_function(examples):
        # Fixed-length padding keeps every example at exactly 512 tokens.
        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

    def process_rating(example):
        # Batched map: `example["rating"]` is a list; produce float labels
        # (regression-style targets).
        example["labels"] = [float(item) for item in example["rating"]]
        return example

    dataset = (
        dataset["train"]
        .map(tokenize_function, batched=True)
        .map(process_rating, batched=True, remove_columns=['rating'])
        .shuffle(seed=42)
        .train_test_split(test_size=0.1)
    )
    # Bug fix: the original computed the splits and then discarded them —
    # return the DatasetDict so callers can actually use the result.
    return dataset
# Script entry point: run the dataset-building pipeline only when this
# file is executed directly, not when imported.
if __name__ == "__main__":
    main()