File size: 1,112 Bytes
ebdb5af
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import spacy
import random


class Delexicalizer:
    """Randomly replace tokens of a text with their NER label or POS tag.

    Each token produced by the spaCy pipeline is rendered as one of:

    * its entity label (``token.ent_type_``), if the token belongs to an
      entity and a random draw falls below ``prob_ner_tag``;
    * its coarse part-of-speech tag (``token.pos_``), if a random draw
      falls below ``prob_pos_tag``;
    * its original text otherwise.

    Randomness comes from the module-level ``random`` generator, so results
    can be made reproducible with ``random.seed``.
    """

    def __init__(
        self,
        prob_pos_tag: float,
        prob_ner_tag: float,
        spacy_model: str = "pt_core_news_sm",
    ) -> None:
        """Validate the probabilities and load the spaCy pipeline.

        Args:
            prob_pos_tag: Probability, in ``[0, 1]``, of replacing a token
                with its POS tag.
            prob_ner_tag: Probability, in ``[0, 1]``, of replacing an entity
                token with its entity label.
            spacy_model: Name of the spaCy model to load. It is downloaded
                first when it is not among the installed models.

        Raises:
            ValueError: If either probability lies outside ``[0, 1]``.
        """
        # Validate before touching spaCy so that bad arguments fail fast
        # instead of first paying for a model download and load.
        for name, value in (
            ("prob_pos_tag", prob_pos_tag),
            ("prob_ner_tag", prob_ner_tag),
        ):
            if value < 0 or value > 1:
                raise ValueError(f"{name} must be between 0 and 1")

        self.prob_pos_tag = prob_pos_tag
        self.prob_ner_tag = prob_ner_tag

        if spacy_model not in spacy.util.get_installed_models():
            spacy.cli.download(spacy_model)

        # NOTE(review): component names are pipeline-specific. Confirm that
        # the chosen model exposes the component that fills `token.pos_`
        # under one of these names; otherwise POS replacement may emit
        # empty strings.
        self.nlp = spacy.load(spacy_model, enable=["parser", "tagger", "ner"])

    def delexicalize(self, text: str) -> str:
        """Return ``text`` with tokens randomly swapped for NER/POS tags.

        Args:
            text: Raw text to run through the spaCy pipeline.

        Returns:
            The rendered tokens joined by single spaces. The original
            whitespace between tokens is not preserved.
        """
        return " ".join(self._render_token(token) for token in self.nlp(text))

    def _render_token(self, token) -> str:
        """Pick the entity label, POS tag or surface text for one token."""
        # The NER draw only happens for entity tokens (short-circuit). An
        # entity token that loses the NER draw still gets a separate POS draw.
        if token.ent_type > 0 and random.random() < self.prob_ner_tag:
            return token.ent_type_
        if random.random() < self.prob_pos_tag:
            return token.pos_
        return token.text