File size: 2,086 Bytes
4f8366b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Sep 18 13:18:59 2023

@author: peter
"""

import tokenizers
import datasets
import pandas

class Preprocessor:
    """
    Tokenizes text columns and converts textual labels to numerical values.

    The instance wraps a pretrained tokenizer loaded with
    ``tokenizers.Tokenizer.from_pretrained`` and is callable: calling it on
    a column of text returns the batch-encoded result.
    """

    def __init__(self, tokenizer_path='roberta-base'):
        """
        Creates the preprocessor

        Parameters
        ----------
        tokenizer_path : str, optional
            The path to the pretrained tokenizer. The default is
            'roberta-base'.

        Returns
        -------
        None.

        """
        self.tokenizer = tokenizers.Tokenizer.from_pretrained(tokenizer_path)
        # These hold whatever ``encode`` returns for the literal marker
        # strings, not bare token ids.
        # NOTE(review): ``encode`` is called with its default settings here,
        # so the result may itself be wrapped in special tokens - confirm
        # that is what the consumers of these attributes expect.
        self.start_token = self.tokenizer.encode('<s>')
        self.end_token = self.tokenizer.encode('</s>')

    def __call__(self, data):
        """
        Tokenizes a column of data

        Parameters
        ----------
        data : pandas.Series
            Column of text data

        Returns
        -------
        list[tokenizers.Encoding]
            Tokenized data, encoded without adding special tokens

        """
        return self.tokenizer.encode_batch(data, add_special_tokens=False)

    def combine(self, *args):
        """
        Tokenizes several data columns as one batch

        The columns are concatenated end to end, in the order given, and the
        result is tokenized with ``__call__``.

        Parameters
        ----------
        *args : sequence of pandas.Series
            The text columns to concatenate and tokenize.

        Returns
        -------
        list[tokenizers.Encoding]
            Tokenized data for every row of every column, in argument order

        """
        # pandas has no ``concatenate`` function; ``concat`` is the call that
        # joins a sequence of Series.
        return self(pandas.concat(args))

    def process_labels(self, data, column):
        """
        Converts labels to numerical value for consistency objective

        Parameters
        ----------
        data : datasets.Dataset
            dataset for which labels need to be converted
        column : str
            The column on which to apply label conversion

        Returns
        -------
        datasets.Dataset
            The dataset with the labels converted

        """
        # NOTE(review): ``align_labels_with_mapping`` is documented as taking
        # a label-name -> integer-id mapping; the float (and negative) values
        # used here should be confirmed against the installed ``datasets``
        # version.
        label_values = {'entailment': 1.0,
                        'neutral': 0.0,
                        'contradiction': -1.0}
        return data.align_labels_with_mapping(label_values,
                                              column)