LCA-PORVID
/

code

Model card Files Files and versions

code / pt_variety_identifier /src /delexicalizer.py

LCA-PORVID's picture

Upload 34 files

ebdb5af verified about 2 years ago

history blame contribute delete

1.11 kB

	import spacy
	import random


	class Delexicalizer:
	def __init__(self, prob_pos_tag, prob_ner_tag, spacy_model="pt_core_news_sm") -> None:

	if not spacy_model in spacy.util.get_installed_models():
	spacy.cli.download(spacy_model)

	self.nlp = spacy.load(spacy_model, enable=["parser", "tagger", "ner"])

	if prob_pos_tag < 0 or prob_pos_tag > 1:
	raise ValueError("prob_pos_tag must be between 0 and 1")

	if prob_ner_tag < 0 or prob_ner_tag > 1:
	raise ValueError("prob_ner_tag must be between 0 and 1")

	self.prob_pos_tag = prob_pos_tag
	self.prob_ner_tag = prob_ner_tag

	def delexicalize(self, text):
	doc = self.nlp(text)

	list_tokens = []

	for token in doc:

	if token.ent_type > 0 and random.uniform(0, 1) < self.prob_ner_tag:
	list_tokens.append(token.ent_type_)

	elif random.uniform(0, 1) < self.prob_pos_tag:
	list_tokens.append(token.pos_)

	else:
	list_tokens.append(token.text)

	return ' '.join(list_tokens)