Spaces:

NILC-ICMC-USP
/

VerificaUD

Sleeping

App Files Files Community

VerificaUD / src /verificaUD.py

NILC-ICMC-USP

Update src/verificaUD.py

d433cc6 verified 5 months ago

raw

history blame contribute delete

10.6 kB

	# VerificaUD - Verificador de arquivos CoNLL-U para textos em Português
	#
	# Este programa recebe um arquivo CoNLL-U que contém um corpus
	# em Português e gera um relatório dos problemas encontrados de
	# acordo com regras de verificação definidas:
	# https://sol.sbc.org.br/index.php/stil/article/view/25485
	#
	# Parâmetros obrigatórios:
	# -o <report.txt> --output <report.txt> (arquivo onde o relatório será salvo)
	# <corpus.conllu> (arquivo a ser verificado)
	#
	# Opções:
	# -h --help help
	# -s --struct só verificação estrutural (struct)
	# -t --tagger só verificação estrutural e morfológica (tagger)
	# -p --parser verificação estrutural, morfológica e sintática (parser) - default
	#
	# Exemplo de utilização:
	#
	# verificaUD -o report.txt corpus.conllu
	#
	# Lê o arquivo corpus.conllu e salva o resultado da verificação
	# no arquivo 'report.txt'
	#
	# last edit: 09/03/2025
	# created by Lucelene Lopes - lucelene@gmail.com

	import sys, os
	sys.path.insert(0, 'src/packages')
	from conlluF import conlluFile
	from tagger import checkTerm, taggerDef
	from parser import checkSent, parserDef
	from structure import goodNumbers, wellformedTree, structDef
	from reporter import conlluReports

	#################################################
	### Captura de argumentos da linha de comando
	#################################################
	def parseOptions(arguments):
	# default options
	output_file, input_file, mode = "report.conllu", "corpus.conllu", "-"
	i = 1
	while i < len(arguments):
	if (arguments[i][0] == "-"):
	# ajuda (help) - mostra ajuda, nada é executado
	if ((arguments[i][1] == "h") and (len(arguments[i])==2)) or \
	(arguments[i] == "-help"):
	print("Opções:\n-h ajuda",
	"Exemplo de utilização:", \
	"verificaUD -o report.conllu corpus.conllu", \
	"Lê o arquivo corpus.conllu e salva o resultado da verificação", \
	"no arquivo 'report.txt'", \
	sep="\n")
	return None
	# opção de arquivo de saída (um nome de arquivo)
	elif ((arguments[i][1] == "o") and (len(arguments[i])==2)) or \
	(arguments[i] == "--output"):
	output_file = arguments[i+1]
	i += 2
	elif ((arguments[i][1] == "s") and (len(arguments[i])==2)) or \
	(arguments[i] == "--struct"):
	if (mode == "-"):
	mode = "s"
	i += 1
	elif ((arguments[i][1] == "t") and (len(arguments[i])==2)) or \
	(arguments[i] == "--tagger"):
	if (mode == "-") or (mode == "s"):
	mode = "t"
	i += 1
	elif ((arguments[i][1] == "p") and (len(arguments[i])==2)) or \
	(arguments[i] == "--parser"):
	if (mode == "-") or (mode == "s") or (mode == "t"):
	mode = "p"
	i += 1
	# opções inválidas - nada é executado
	else:
	print("Opção {} inválida, demais opções ignoradas, por favor execute novamente".format(arguments[i]))
	return None
	# arquivo de entrada - só é incluído se existir
	else:
	if (os.path.isfile(arguments[i])):
	input_file = arguments[i]
	i += 1
	else:
	print("O arquivo {} não foi encontrado, por favor execute novamente".format(arguments[i]))
	return None
	if (mode == "-"):
	mode = "p"
	return [output_file, input_file], mode

	########################################
	# Nosso verificador STRUCT
	# input:
	# name - the name of the conllu file
	# report - the name of the textual output file
	########################################
	def struct(name, report, samples):
	dump = open(report, "w")
	base = conlluFile(name)
	s, t = base.getSandT()
	print("Base:", name, "- sentenças:", s, "- tokens:", t, "- STRUCT relat:", report)
	totalS, totalT = 0, 0
	for i in range(base.getS()):
	b = base.getSentByIndex(i)
	acc = goodNumbers(b, dump, samples)
	if (acc == 0):
	acc = wellformedTree(b, dump, samples)
	totalT += acc
	if (acc > 0):
	totalS += 1
	dump.close()
	print("Problemas da Estrutura de CoNLL-U (struct):", totalT)
	print("Sentenças sem problems {} ({:2.2f}%)".format(s-totalS, 100*(s-totalS)/s))
	print("Tokens sem problems {} ({:2.2f}%)".format(t-totalT, 100*(t-totalT)/t))
	return s, t, totalT, 0

	########################################
	# Nosso verificador TAGGER
	# input:
	# name - the name of the conllu file
	# report - the name of the textual output file
	########################################
	def tagger(name, report, samples):
	dump = open(report, "w")
	base = conlluFile(name)
	s, t = base.getSandT()
	print("Base:", name, "- sentenças:", s, "- tokens:", t, "- TAGGER relat:", report)
	totalS, totalT = 0, 0
	for i in range(base.getS()):
	b = base.getSentByIndex(i)
	acc = 0
	for tk in b[4]:
	acc += checkTerm(tk, b[0], dump, samples)
	totalT += acc
	if (acc > 0):
	totalS += 1
	dump.close()
	print("Problemas de Formação Lexical (tagger):", totalT)
	print("Sentenças sem problems {} ({:2.2f}%)".format(s-totalS, 100*(s-totalS)/s))
	print("Tokens sem problems {} ({:2.2f}%)".format(t-totalT, 100*(t-totalT)/t))
	return s, t, totalT, 0

	########################################
	# Nosso verificador PARSER
	# input:
	# name - the name of the conllu file
	# report - the name of the textual output file
	########################################
	def parser(name, report, samples):
	dump = open(report, "w")
	base = conlluFile(name)
	s, t = base.getSandT()
	print("Base:", name, "- sentenças:", s, "- tokens:", t, "- PARSER relat:", report)
	totalS, totalT, totalW = 0, 0, 0
	for i in range(base.getS()):
	b = base.getSentByIndex(i)
	rules, acc, warn = checkSent(b, dump, samples)
	totalT += acc
	totalW += warn
	if (acc+warn > 0):
	totalS += 1
	####

	dump.close()
	print("Problemas de Dependência Relacional (parser):", totalT)
	print("Avisos de verificação de Dependência Relacional (parser):", totalW)
	print("Sentenças sem problems ou avisos {} ({:2.2f}%)".format(s-totalS, 100*(s-totalS)/s))
	print("Tokens sem problems {} ({:2.2f}%)".format(t-totalT, 100*(t-totalT)/t))
	print("Tokens sem problems ou avisos {} ({:2.2f}%)".format(t-(totalT+totalW), 100*(t-(totalT+totalW))/t))
	return s, t, totalT, totalW

	########################################
	# mergeReport
	########################################
	def mergeReport(name, corpus, outfile, s, t, e_s, w_s, e_t, w_t, e_p, w_p):
	issues = []
	probS = []
	avisoS = []
	infile = open(name+"_struct.txt")
	for line in infile:
	issues.append(line[:-1])
	buf = line.split("\t")
	if (buf[0] not in probS):
	probS.append(buf[0])
	infile.close()
	infile = open(name+"_tagger.txt")
	for line in infile:
	issues.append(line[:-1])
	buf = line.split("\t")
	if (buf[0] not in probS):
	probS.append(buf[0])
	infile.close()
	infile = open(name+"_parser.txt")
	for line in infile:
	issues.append(line[:-1])
	if ("Normalmente" not in line):
	buf = line.split("\t")
	if (buf[0] not in probS):
	probS.append(buf[0])
	else:
	buf = line.split("\t")
	if (buf[0] not in avisoS):
	avisoS.append(buf[0])
	infile.close()
	issues.sort()
	print("Arquivo:", corpus+"\n", file=outfile)
	print("sentencas: {:5>} (tokens: {})".format(s,t), file=outfile)
	print("sentencas sem erro: {:5>}".format(s-len(probS)), file=outfile)
	print("sentencas sem erro ou aviso: {:5>}".format(s-len(probS)-len(avisoS)), file=outfile)
	print("erros estruturais: {:5>} - avisos: {:5>}".format(e_s, w_s), file=outfile)
	print("erros lexicais: {:5>} - avisos: {:5>}".format(e_t, w_t), file=outfile)
	print("erros sintáticos: {:5>} - avisos: {:5>}".format(e_p, w_p), file=outfile)

	print("\nErros e avisos encontrados:", file=outfile)
	for i in issues:
	buf = i.split("\t")
	print("{:15}\ttoken {:3}\t{}".format(buf[0], buf[1], buf[2]), file=outfile)

	########################################
	# Do it all
	########################################
	def doIt(name, outfile, mode):
	if (name[-7:] != ".conllu"):
	outputfile = name
	else:
	outputfile = name[:-7]
	samples = conlluReports("samples", [structDef,taggerDef,parserDef])
	if (mode in ["s", "t", "p"]):
	print("Executando verificação estrutural...")
	s, t, e_s, w_s = struct(name, outputfile+"_struct.txt", samples)
	else:
	s, t, e_s, w_s = 0,0,0,0
	if (mode in ["t", "p"]):
	print("Executando verificação de tagger...")
	s, t, e_t, w_t = tagger(name, outputfile+"_tagger.txt", samples)
	else:
	s, t, e_t, w_t = 0,0,0,0
	if (mode in ["p"]):
	print("Executando verificação de parser...")
	s, t, e_p, w_p = parser(name, outputfile+"_parser.txt", samples)
	else:
	s, t, e_p, w_p = 0,0,0,0
	samples.closeAll()
	mergeReport(outputfile, name, outfile, s, t, e_s, w_s, e_t, w_t, e_p, w_p)
	return e_s+e_t+e_p, w_s+w_t+w_p


	########################################
	# Main function - verificaUD
	########################################
	def verificaUD(input_file=None, output_file=None, chosen_mode=None):
	if (input_file) and (output_file) and (chosen_mode):
	io_files = [output_file, input_file]
	mode = chosen_mode
	elif (len(sys.argv) == 1):
	io_files = ["report.txt", "corpus.conllu"]
	mode = "p"
	else:
	io_files, mode = parseOptions(sys.argv)
	if (io_files != None):
	print("Starting verification of", io_files[1])
	outfile = open(io_files[0], "w")
	e, w = doIt(io_files[1], outfile, mode)
	outfile.close()
	print("Arquivo {} salvo com o relatório do arquivo {} contendo {} erros e {} avisos".format(io_files[0],io_files[1], e, w))
	else:
	print("Por favor, execute o programa novamente")