#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Sep 18 13:18:59 2023
@author: peter
"""
import tokenizers
import datasets
import pandas
class Preprocessor(object):
    """Tokenizes text columns and converts labels for the consistency objective."""

    def __init__(self, tokenizer_path='roberta-base'):
        """
        Creates the preprocessor.

        Parameters
        ----------
        tokenizer_path : str, optional
            The path to the pretrained tokenizer. The default is 'roberta-base'.

        Returns
        -------
        None.
        """
        self.tokenizer = tokenizers.Tokenizer.from_pretrained(tokenizer_path)
        # Cache the special-token encodings so callers can add them themselves:
        # __call__ tokenizes with add_special_tokens=False.
        self.start_token = self.tokenizer.encode('<s>')
        self.end_token = self.tokenizer.encode('</s>')

    def __call__(self, data):
        """
        Tokenizes a column of data.

        Parameters
        ----------
        data : pandas.Series
            Column of text data.

        Returns
        -------
        list[tokenizers.Encoding]
            Tokenized data.
        """
        return self.tokenizer.encode_batch(data, add_special_tokens=False)

    def combine(self, *args):
        """
        Tokenizes several data columns as a single batch.

        Parameters
        ----------
        *args : sequence of pandas.Series
            Text columns to concatenate and tokenize together.

        Returns
        -------
        list[tokenizers.Encoding]
            Tokenized data for all columns, in argument order.
        """
        # BUG FIX: pandas has no `concatenate` function -- the original call
        # raised AttributeError at runtime. `pandas.concat` is the correct API
        # for stacking Series end-to-end.
        return self(pandas.concat(args))

    def process_labels(self, data, column):
        """
        Converts labels to numerical values for the consistency objective.

        Parameters
        ----------
        data : datasets.Dataset
            Dataset for which labels need to be converted.
        column : str
            The column on which to apply label conversion.

        Returns
        -------
        datasets.Dataset
            The dataset with the labels converted.
        """
        # entailment/neutral/contradiction map onto a signed similarity target.
        label_values = {'entailment': 1.0,
                        'neutral': 0.0,
                        'contradiction': -1.0}
        return data.align_labels_with_mapping(label_values,
                                              column)