#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Sep 18 13:18:59 2023

@author: peter
"""

import tokenizers
import datasets  # noqa: F401 -- referenced only in type documentation
import pandas


class Preprocessor(object):
    """Tokenizes text columns and maps NLI labels to numeric values.

    Wraps a pretrained ``tokenizers.Tokenizer`` for batch encoding of
    pandas text columns, and provides label alignment for a consistency
    objective (entailment/neutral/contradiction -> 1.0/0.0/-1.0).
    """

    def __init__(self, tokenizer_path='roberta-base'):
        """
        Creates the preprocessor.

        Parameters
        ----------
        tokenizer_path : str, optional
            The path to the pretrained tokenizer. The default is
            'roberta-base'.

        Returns
        -------
        None.
        """
        self.tokenizer = tokenizers.Tokenizer.from_pretrained(tokenizer_path)
        # NOTE(review): both calls encode the empty string, so start_token
        # and end_token hold *identical* special-tokens-only encodings
        # (e.g. <s></s> for RoBERTa). If distinct start/end tokens were
        # intended, encode('<s>') / encode('</s>') or token_to_id() is
        # probably what was meant -- confirm against callers before changing.
        self.start_token = self.tokenizer.encode('')
        self.end_token = self.tokenizer.encode('')

    def __call__(self, data):
        """
        Tokenizes a column of data.

        Parameters
        ----------
        data : pandas.Series
            Column of text data.

        Returns
        -------
        list[tokenizers.Encoding]
            Tokenized data, encoded without special tokens.
        """
        return self.tokenizer.encode_batch(data, add_special_tokens=False)

    def combine(self, *args):
        """
        Tokenizes several data columns as a single batch.

        Parameters
        ----------
        *args : sequence of pandas.Series
            Text columns to concatenate and tokenize together.

        Returns
        -------
        list[tokenizers.Encoding]
            Tokenized data for all columns, stacked in argument order.
        """
        # BUG FIX: pandas has no `concatenate` function; the original call
        # raised AttributeError. `pandas.concat` is the correct API for
        # stacking Series end-to-end.
        return self(pandas.concat(args))

    def process_labels(self, data, column):
        """
        Converts labels to numerical values for the consistency objective.

        Parameters
        ----------
        data : datasets.Dataset
            Dataset for which labels need to be converted.
        column : str
            The column on which to apply label conversion.

        Returns
        -------
        datasets.Dataset
            The dataset with the labels converted.
        """
        label_values = {'entailment': 1.0,
                        'neutral': 0.0,
                        'contradiction': -1.0}
        return data.align_labels_with_mapping(label_values, column)