import random

import spacy


class Delexicalizer:
    """Randomly replace tokens of a text with their NER or POS labels.

    Each token of the analysed text is independently replaced by its
    entity type (with probability ``prob_ner_tag``, only when the token is
    part of a named entity) or, failing that, by its coarse part-of-speech
    tag (with probability ``prob_pos_tag``). Tokens that are not replaced
    keep their original text.
    """

    # Pipeline components kept active when loading the model.
    # ``token.pos_`` is not filled by "tagger" on its own in spaCy v3
    # pipelines: it is set by "morphologizer", or by "attribute_ruler"
    # mapping fine-grained tags to coarse ones, and those components in
    # turn consume the shared "tok2vec"/"transformer" embeddings. Enabling
    # only parser/tagger/ner therefore leaves POS tags empty (the default
    # Portuguese model has no "tagger" at all).
    # NOTE(review): names missing from a given pipeline are expected to be
    # ignored by ``enable`` (the original code already relied on this for
    # "tagger") -- confirm against the installed spaCy version.
    _PIPELINE_COMPONENTS = (
        "tok2vec",
        "transformer",
        "tagger",
        "morphologizer",
        "attribute_ruler",
        "parser",
        "ner",
    )

    def __init__(
        self,
        prob_pos_tag: float,
        prob_ner_tag: float,
        spacy_model: str = "pt_core_news_sm",
    ) -> None:
        """Validate the probabilities and load the spaCy pipeline.

        Args:
            prob_pos_tag: Probability in [0, 1] of replacing a token with
                its POS tag.
            prob_ner_tag: Probability in [0, 1] of replacing an entity
                token with its entity type.
            spacy_model: Name of the spaCy model package to load. It is
                downloaded first when it is not reported as installed.

        Raises:
            ValueError: If either probability lies outside [0, 1]
                (NaN included).
        """
        # Validate first so that bad arguments fail fast, before any
        # (potentially slow) model download or load takes place.
        self._check_probability("prob_pos_tag", prob_pos_tag)
        self._check_probability("prob_ner_tag", prob_ner_tag)

        if spacy_model not in spacy.util.get_installed_models():
            spacy.cli.download(spacy_model)
        self.nlp = spacy.load(
            spacy_model,
            enable=list(self._PIPELINE_COMPONENTS),
        )

        self.prob_pos_tag = prob_pos_tag
        self.prob_ner_tag = prob_ner_tag

    @staticmethod
    def _check_probability(name: str, value: float) -> None:
        """Raise ``ValueError`` unless ``value`` lies within [0, 1]."""
        # The chained comparison is False for NaN, so NaN is rejected too.
        if not 0 <= value <= 1:
            raise ValueError(f"{name} must be between 0 and 1")

    def delexicalize(self, text: str) -> str:
        """Return ``text`` with some tokens replaced by NER/POS labels.

        Args:
            text: Raw text to analyse with the loaded pipeline.

        Returns:
            The resulting tokens joined by single spaces; the original
            spacing between tokens is not preserved.
        """
        doc = self.nlp(text)
        pieces = []
        for token in doc:
            # The second draw only happens when the NER branch is not
            # taken, so an entity token that is not replaced by its entity
            # type still gets a chance to be replaced by its POS tag.
            if token.ent_type > 0 and random.random() < self.prob_ner_tag:
                pieces.append(token.ent_type_)
            elif random.random() < self.prob_pos_tag:
                pieces.append(token.pos_)
            else:
                pieces.append(token.text)
        return " ".join(pieces)