plentas

Build error

App Files Files Community

plentas / codeScripts /utils.py

jfarray

Duplicate from xiomarablanco/plentas

51a4fb0 almost 3 years ago

raw

history blame

12.1 kB

	import json
	import numpy as np
	import hunspell
	import nltk
	import nltk.corpus
	from nltk.tokenize import sent_tokenize
	from nltk.tokenize import word_tokenize
	from nltk import ne_chunk
	import re
	import yake
	import spacy
	#dic = hunspell.Hunspell('/Users/miguel.r/Desktop/UNIR/PLenTaS/CORPUS/dict_es_ES/es_ES', '/Users/miguel.r/Desktop/es_ES/es_ES.dic')

	nlp = spacy.load('es_core_news_sm') # spaCy pipeline loaded from the Spanish (es) package 'es_core_news_sm'

	# Classes used to count the syllables of a word (Source: https://github.com/amunozf/separasilabas/blob/master/separasilabas.py)

	#class char():
	#def __init__(self):
	# pass

	class char_line():
	    """A word together with the phonetic class of each of its characters.

	    Attributes:
	        word: the original text, unchanged.
	        char_line: list of ``(character, class)`` pairs, one per character.
	        type_line: the classes joined into one string, e.g. ``'cVsV'`` for
	            ``'casa'``. Rules are matched against this string.

	    Character classes: ``'V'`` strong vowel (including accented ``í``/``ú``),
	    ``'v'`` weak vowel, ``'x'``, ``'s'``, and ``'c'`` for everything else.
	    """

	    # Built once for the class instead of on every char_type() call.
	    _STRONG_VOWELS = frozenset('aáeéoóíú')
	    _WEAK_VOWELS = frozenset('iuü')

	    def __init__(self, word):
	        self.word = word
	        self.char_line = [(char, self.char_type(char)) for char in word]
	        self.type_line = ''.join(chartype for char, chartype in self.char_line)

	    def char_type(self, char):
	        """Return the one-letter class of ``char``.

	        The comparison is case-insensitive so that capitalised words
	        ("Ana", "ÁRBOL") are not treated as if their vowels were consonants.
	        """
	        lowered = char.lower()
	        if lowered in self._STRONG_VOWELS:
	            return 'V'  # strong vowel
	        if lowered in self._WEAK_VOWELS:
	            return 'v'  # weak vowel
	        if lowered == 'x':
	            return 'x'
	        if lowered == 's':
	            return 's'
	        return 'c'

	    def find(self, finder):
	        """Return the index of pattern ``finder`` in ``type_line``, or -1."""
	        return self.type_line.find(finder)

	    def split(self, pos, where):
	        """Cut the word at ``pos + where`` and return both halves as char_line."""
	        cut = pos + where
	        return char_line(self.word[0:cut]), char_line(self.word[cut:])

	    def split_by(self, finder, where):
	        """Split at the first match of ``finder`` in ``type_line``.

	        Returns ``(left, right)`` when the pattern is found, otherwise
	        ``(self, False)`` so callers can test the second element.
	        """
	        split_point = self.find(finder)
	        if split_point != -1:
	            return self.split(split_point, where)
	        return self, False

	    def __str__(self):
	        return self.word

	    def __repr__(self):
	        return repr(self.word)

	class silabizer():
	    """Rule-based splitter of Spanish words into syllables.

	    Calling an instance with a word returns a list of ``char_line`` objects,
	    one per syllable; ``len()`` of that list is the syllable count.
	    """

	    # (pattern over char_line.type_line, offset from the match start at which
	    # to cut). Order matters: the first rule that yields an acceptable cut wins.
	    _RULES = (
	        ('VV', 1), ('cccc', 2), ('xcc', 1), ('ccx', 2), ('csc', 2),
	        ('xc', 1), ('cc', 1), ('vcc', 2), ('Vcc', 2), ('sc', 1),
	        ('cs', 1), ('Vc', 1), ('vc', 1), ('Vs', 1), ('vs', 1),
	    )
	    # A fragment made only of these classes has no vowel and cannot be a syllable.
	    _VOWELLESS = frozenset(['c', 's', 'x', 'cs'])
	    # Letter pairs that must stay together when they straddle the cut point.
	    _DIGRAPHS = frozenset([('l', 'l'), ('r', 'r'), ('c', 'h')])

	    def __init__(self):
	        self.grammar = []

	    def split(self, chars):
	        """Recursively split ``chars`` (a char_line) into syllables."""
	        for split_rule, where in self._RULES:
	            first, second = chars.split_by(split_rule, where)
	            if not second:
	                continue
	            if first.type_line in self._VOWELLESS or second.type_line in self._VOWELLESS:
	                continue
	            # Consonant + l/r forms an onset cluster (e.g. "bl", "tr").
	            if first.type_line[-1] == 'c' and second.word[0] in ('l', 'r'):
	                continue
	            # The digraph is formed by the LAST letter of the left part and the
	            # FIRST letter of the right part; comparing against the last letter
	            # of the right part let "coche" be cut as coc-he.
	            if (first.word[-1], second.word[0]) in self._DIGRAPHS:
	                continue
	            return self.split(first) + self.split(second)
	        return [chars]

	    def __call__(self, word):
	        return self.split(char_line(word))

	# Counter of the number of sentences and words used in the answer
	def check_senteces_words(student_answer):
	    """Count sentences, words and syllables in ``student_answer``.

	    The text is split into sentences with NLTK's ``sent_tokenize``; every
	    period is then removed and each sentence is split on whitespace.

	    Returns:
	        A tuple ``(sentence_count, word_count, syllable_count,
	        letter_per_word)`` where ``letter_per_word`` lists the length of
	        every word in order.
	    """
	    # One syllabifier is enough; it holds no per-word state.
	    syllabify = silabizer()

	    sentences = [re.sub('\\.', '', token) for token in sent_tokenize(student_answer)]

	    # str.split() with no argument drops the empty strings that split(' ')
	    # produced for repeated spaces; those were counted as one-syllable,
	    # zero-letter words and skewed every downstream readability index.
	    words = [word for sentence in sentences for word in sentence.split()]

	    syll = sum(len(syllabify(word)) for word in words)
	    letter_per_word = [len(word) for word in words]

	    sentencesLenght = len(sentences)
	    wordsLenght = len(words)
	    return sentencesLenght, wordsLenght, syll, letter_per_word

	# Spelling-mistake counter
	def spelling_corrector(student_answer, hunspell_aff = '/Users/javier.sanz/OneDrive - UNIR/Desktop/PLeNTas_V3/es_ES/es_ES' , hunspell_dic = '/Users/javier.sanz/OneDrive - UNIR/Desktop/PLeNTas_V3/es_ES/es_ES.dic' ):
	    """Find the words of ``student_answer`` rejected by a Hunspell dictionary.

	    The answer is split on single spaces, each chunk is broken into
	    alphanumeric pieces by ``clean_words`` and every piece is checked with
	    ``Hunspell.spell``.

	    Args:
	        student_answer: text to check.
	        hunspell_aff, hunspell_dic: passed positionally, in that order, to
	            ``hunspell.Hunspell``.

	    Returns:
	        ``(errors, wrong_words)``: the number of rejected pieces and the
	        pieces themselves, in order of appearance (duplicates kept).
	    """
	    checker = hunspell.Hunspell(hunspell_aff, hunspell_dic)
	    wrong_words = [
	        piece
	        for chunk in student_answer.split(' ')
	        for piece in clean_words(chunk)
	        if not checker.spell(piece)
	    ]
	    return len(wrong_words), wrong_words

	# Readability of the answer according to the Fernández-Huerta index
	def FHuertas_index(sentencesLenght, wordsLenght, syll):
	    """Compute the Fernández-Huerta readability index of a text.

	    ``FH = 206.84 - 0.60 * P - 1.02 * F`` where ``P`` is the number of
	    syllables per 100 words and ``F`` the number of sentences per 100 words.

	    Args:
	        sentencesLenght: number of sentences.
	        wordsLenght: number of words.
	        syll: number of syllables.

	    Returns:
	        ``(FH, legibilidad_fh)``: the index rounded to 3 decimals and its
	        label. The label is ``""`` when the index falls outside (0, 100].
	        A text with no words has no defined index and yields ``(0.0, "")``.
	    """
	    if not wordsLenght:
	        # Avoid ZeroDivisionError on empty answers.
	        return 0.0, ""

	    FH = 206.84 - 0.60 * (syll * 100 / wordsLenght) - 1.02 * (sentencesLenght * 100 / wordsLenght)
	    FH = round(FH, 3)

	    # Inclusive upper bound of each band, in ascending order.
	    bands = (
	        (30, 'muy díficil'),
	        (50, 'díficil'),
	        (60, 'ligeramente díficil'),
	        (70, 'adecuado'),
	        (80, 'ligeramente fácil'),
	        (90, 'fácil'),
	        (100, 'muy fácil'),
	    )
	    legibilidad_fh = ""
	    if 0 < FH <= 100:
	        for upper, label in bands:
	            if FH <= upper:
	                legibilidad_fh = label
	                break

	    return FH, legibilidad_fh

	# Readability of the answer according to the mu index
	def mu_index(sentencesLenght, wordsLenght, letter_per_word):
	    """Compute the mu (µ) readability index of a text.

	    ``mu = (n / (n - 1)) * (mean / variance) * 100`` where ``n`` is the
	    number of words and mean/variance are taken over the word lengths
	    (``numpy.mean`` / ``numpy.var``).

	    Args:
	        sentencesLenght: number of sentences (accepted for signature
	            compatibility; not used by the formula).
	        wordsLenght: number of words.
	        letter_per_word: length of every word.

	    Returns:
	        ``(mu, legibilidad_mu)``: the index rounded to 3 decimals and its
	        label. The label is ``""`` when the index falls outside (0, 100].
	        When the index is undefined (fewer than two words, or all words of
	        the same length so the variance is zero) ``(0.0, "")`` is returned.
	    """
	    if wordsLenght < 2 or len(letter_per_word) == 0:
	        # n - 1 would be zero (or the mean undefined).
	        return 0.0, ""

	    med = np.mean(letter_per_word)
	    var = np.var(letter_per_word)
	    if var == 0:
	        # Division by a zero variance would give inf/nan.
	        return 0.0, ""

	    mu = (wordsLenght / (wordsLenght - 1)) * (med / var) * 100
	    mu = round(float(mu), 3)

	    # Inclusive upper bound of each band, in ascending order.
	    bands = (
	        (30, 'muy difícil'),
	        (50, 'difícil'),
	        (60, 'ligeramente difícil'),
	        (70, 'adecuado'),
	        (80, 'ligeramente fácil'),
	        (90, 'fácil'),
	        (100, 'muy fácil'),
	    )
	    legibilidad_mu = ""
	    if 0 < mu <= 100:
	        for upper, label in bands:
	            if mu <= upper:
	                legibilidad_mu = label
	                break

	    return mu, legibilidad_mu

	# Keyword extractor for a text, using the yake library
	def keyword_extractor(text, numOfKeywords, language, max_ngram_size,deduplication_threshold = 0.9, features=None):
	    """Extract keywords from ``text`` with yake.

	    Newlines are replaced by spaces and the text is lower-cased before it is
	    handed to ``yake.KeywordExtractor``.

	    Args:
	        text: input text.
	        numOfKeywords: forwarded as ``top``.
	        language: forwarded as ``lan``.
	        max_ngram_size: forwarded as ``n``.
	        deduplication_threshold: forwarded as ``dedupLim``.
	        features: forwarded as ``features``.

	    Returns:
	        List with the first element of every entry returned by
	        ``extract_keywords``, in the order yake returns them.
	    """
	    # The pattern is a backslash-escaped newline, i.e. it matches "\n".
	    normalized = re.sub('\\\n', ' ', text).lower()
	    extractor = yake.KeywordExtractor(
	        lan=language,
	        n=max_ngram_size,
	        dedupLim=deduplication_threshold,
	        top=numOfKeywords,
	        features=features,
	    )
	    return [entry[0] for entry in extractor.extract_keywords(normalized)]

	# Word categorization
	# Splits a student answer into sentences, tags the words of a sentence with
	# spaCy part-of-speech labels and runs NLTK chunkers over those tags.
	# The function contains no return statement.
	# NOTE(review): the indentation of this block is not visible in this copy, so
	# which statements after the `while` header belong to the loop body cannot be
	# determined here — confirm against the upstream file before changing it.
	def word_categorization(student_answer):
	# Sentences of the answer, as produced by NLTK's sent_tokenize.
	fileDocument=[]
	TokenizeAnswer = sent_tokenize(student_answer)
	for token in TokenizeAnswer:
	fileDocument.append(token)
	sentencesLenght = len(fileDocument)
	# Index of the sentence currently being processed.
	sentence=0
	while sentence < sentencesLenght:
	# Tokenize the sentence into words with NLTK; word_tokens is only referenced
	# by the commented-out nltk.pos_tag call further down.
	word_tokens = word_tokenize(fileDocument[sentence])
	# Run the spaCy pipeline on the sentence and keep (token text, POS tag) pairs.
	doc = nlp(fileDocument[sentence])
	pre_chunk = [(w.text, w.pos_) for w in doc]
	#print(pre_chunk)
	sentence += 1
	#pre_chunk = nltk.pos_tag(word_tokens)
	tree = ne_chunk(pre_chunk) # NLTK ne_chunk applied to the spaCy (text, POS) pairs
	#grammer_np = ("NP: {<DT>?<JJ>*<NN>}")

	# Chunking rules. grammer_np is the pattern handed to RegexpParser below;
	# `grammar` is assigned but not used anywhere in this function.
	grammer_np = ("NP: {<DET>?<ADJ><NOUN><VERB>}")
	grammar = r"""
	NP: {<DT\|PP\$>?<JJ>*<NN>} # chunk determiner/possessive, adjectives and nouns
	{<NNP>+} # chunk sequences of proper nouns
	"""
	chunk_parser = nltk.RegexpParser(grammer_np)
	# chunk_result is computed but neither returned nor stored.
	chunk_result = chunk_parser.parse(tree)

	#..................................................................................................
	def char_split(word, character):
	    """Split ``word`` at the first occurrence of ``character``.

	    Each character of ``word`` is compared with ``character``; the first one
	    that is equal is dropped and the text on either side is returned.

	    Returns:
	        ``[before, after]``. When no character matches, ``before`` is the
	        whole word and ``after`` is ``""``.
	    """
	    for position, current in enumerate(word):
	        if current == character:
	            return [word[:position], word[position + 1:]]
	    return [word[:], ""]

	def clean_words(string):
	    """Return the maximal alphanumeric runs of ``string``, in order.

	    Every character for which ``str.isalnum()`` is false acts as a separator
	    and is discarded; empty pieces are never returned. For example
	    ``"hola, mundo!"`` gives ``["hola", "mundo"]`` and ``"1-5"`` gives
	    ``["1", "5"]``.

	    The text is scanned once; the earlier version re-split the remaining
	    text several times for every separator it met.
	    """
	    words_sentence = []
	    current = []
	    for char in string:
	        if char.isalnum():
	            current.append(char)
	        elif current:
	            # A separator closes the run collected so far.
	            words_sentence.append("".join(current))
	            current = []
	    if current:
	        words_sentence.append("".join(current))
	    return words_sentence

	def getNameFile(string):
	    """Return the file name of a "/"-separated path without its ".json" suffix.

	    Only a literal ".json" at the very end of the name is removed. The
	    previous pattern ``".json"`` treated the dot as "any character" and was
	    not anchored, so a name such as ``myjson_data.json`` lost text from its
	    middle as well.
	    """
	    basename = string.split("/")[-1]
	    return re.sub(r"\.json$", "", basename)


	def getIDrange(rango_ID, df):
	    """Translate a textual selection of 1-based IDs into 0-based indices.

	    Args:
	        rango_ID: either the literal ``"All"`` or a comma-separated list whose
	            items are broken into numbers by ``clean_words``. An item with two
	            numbers is an inclusive range, an item with one number is a single
	            ID, and any other item is ignored.
	        df: object indexable by ``'hashed_id'``; only its length is used, and
	            only for ``"All"``.

	    Returns:
	        List of 0-based indices in the order given (duplicates kept).
	    """
	    if rango_ID == "All":
	        return list(range(len(df['hashed_id'])))

	    selected = []
	    for item in rango_ID.split(","):
	        numbers = clean_words(item)
	        if len(numbers) == 2:
	            first, last = int(numbers[0]), int(numbers[1])
	            selected.extend(range(first - 1, last))
	        elif len(numbers) == 1:
	            selected.append(int(numbers[0]) - 1)
	    return selected

	def save_json(path, data, isIndent = True):
	    """Serialize ``data`` as JSON and write it to ``path``.

	    Args:
	        path: destination file; it is overwritten.
	        data: JSON-serializable object.
	        isIndent: when true the output is pretty-printed with an indent of
	            11 spaces, otherwise it is written on a single line.

	    Non-ASCII characters are written as-is (``ensure_ascii=False``), so the
	    file is opened explicitly as UTF-8. Relying on the platform default
	    encoding could fail or produce a file that ``load_json`` (which reads
	    utf8) cannot decode.
	    """
	    indent = 11 if isIndent else None
	    json_object = json.dumps(data, indent=indent, ensure_ascii=False)
	    with open(path, "w", encoding="utf-8") as outfile:
	        outfile.write(json_object)


	def load_json(path):
	    """Read a utf8 file holding one or more JSON values and return a list.

	    The file content is wrapped in ``[...]`` and a comma is inserted wherever
	    a line ending in ``}`` is directly followed by a line starting with ``{``,
	    so newline-separated objects become the elements of one JSON array.
	    A file with a single object therefore yields a one-element list.
	    """
	    with open(path, "r", encoding="utf8") as handle:
	        raw_text = handle.read()
	    joined = raw_text.replace("}\n{", "},\n{")
	    return json.loads("[" + joined + "]")

	def load_json_dtset(path):
	    """Read a latin-1 file holding one or more JSON values and return a list.

	    The file content is wrapped in ``[...]`` and a comma is inserted wherever
	    a line ending in ``}`` is directly followed by a line starting with ``{``,
	    so newline-separated objects become the elements of one JSON array.
	    A file with a single object therefore yields a one-element list.
	    """
	    with open(path, "r", encoding="latin-1") as handle:
	        raw_text = handle.read()
	    joined = raw_text.replace("}\n{", "},\n{")
	    return json.loads("[" + joined + "]")


	def splitResponse(respuesta_alumno_raw):
	    """Split a student's response into lower-cased sentences without periods.

	    Newlines are replaced by spaces and the text is lower-cased, then it is
	    cut into sentences with NLTK's ``sent_tokenize`` and every period is
	    removed from each sentence.

	    Returns:
	        List of sentence strings in their original order.
	    """
	    # The pattern is a backslash-escaped newline, i.e. it matches "\n".
	    flattened = re.sub('\\\n', ' ', respuesta_alumno_raw).lower()
	    return [re.sub('\\.', '', sentence) for sentence in sent_tokenize(flattened)]

	def create_file_path(file, doctype):
	    """
	    Build the relative path under which ``file`` is stored.

	    Inputs:
	        file: file name, or sub-path plus file name, to place in the folder
	        doctype: 1 - info from the API, 2 - output documents, 3 - images,
	                 any other value - BERT models/documents
	    Outputs:
	        path: the folder prefix for ``doctype`` followed by ``file``
	    """
	    known_prefixes = (
	        (1, "api/"),
	        (2, "archivos/OutputFiles2/"),
	        (3, "archivos/Images/"),
	    )
	    for code, prefix in known_prefixes:
	        if doctype == code:
	            return prefix + file
	    return "codeScripts/Dependencies/BERT-models/Prueba3/" + file