# VerificaUD - CoNLL-U file checker for Portuguese texts
#
# This program receives a CoNLL-U file holding a Portuguese corpus and
# writes a report of the problems found according to the verification
# rules described in:
#     https://sol.sbc.org.br/index.php/stil/article/view/25485
#
# Required parameters:
#     -o --output  (file where the report will be saved)
#     (file to be checked)
#
# Options:
#     -h --help    help
#     -s --struct  structural check only (struct)
#     -t --tagger  structural and morphological checks only (tagger)
#     -p --parser  structural, morphological and syntactic checks (parser) - default
#
# Usage example:
#
#     verificaUD -o report.txt corpus.conllu
#
#     Reads corpus.conllu and saves the verification result in 'report.txt'
#
# last edit: 09/03/2025
# created by Lucelene Lopes - lucelene@gmail.com

import sys
import os

# The project packages live in src/packages; this must run before the
# project imports below.
sys.path.insert(0, 'src/packages')
from conlluF import conlluFile
from tagger import checkTerm, taggerDef
from parser import checkSent, parserDef
from structure import goodNumbers, wellformedTree, structDef
from reporter import conlluReports


#################################################
### Command-line argument handling
#################################################
def parseOptions(arguments):
    """Parse the command-line arguments (``arguments[0]`` is the program name).

    Returns ``([output_file, input_file], mode)`` where ``mode`` is one of
    ``"s"``, ``"t"`` or ``"p"``, or ``None`` when nothing should be executed
    (help requested, invalid option, missing ``-o`` value, or an input file
    that does not exist). When several mode flags are given, the most
    complete one wins (p > t > s); with none, the mode is ``"p"``.
    """
    # default options
    output_file, input_file, mode = "report.conllu", "corpus.conllu", "-"
    i = 1
    while i < len(arguments):
        arg = arguments[i]
        if arg.startswith("-"):
            # help - shows usage, nothing is executed
            # ("-help" is kept for backward compatibility)
            if arg in ("-h", "-help", "--help"):
                print("Opções:",
                      "-h --help    ajuda",
                      "-o --output  arquivo onde o relatório será salvo",
                      "-s --struct  só verificação estrutural (struct)",
                      "-t --tagger  só verificação estrutural e morfológica (tagger)",
                      "-p --parser  verificação estrutural, morfológica e sintática (parser) - default",
                      "Exemplo de utilização:",
                      "verificaUD -o report.txt corpus.conllu",
                      "Lê o arquivo corpus.conllu e salva o resultado da verificação",
                      "no arquivo 'report.txt'",
                      sep="\n")
                return None
            # output file option (followed by a file name)
            elif arg in ("-o", "--output"):
                if i + 1 >= len(arguments):
                    # the option is the last argument: there is no file name to read
                    print("Opção {} requer um nome de arquivo, por favor execute novamente".format(arg))
                    return None
                output_file = arguments[i + 1]
                i += 2
            elif arg in ("-s", "--struct"):
                if mode == "-":
                    mode = "s"
                i += 1
            elif arg in ("-t", "--tagger"):
                if mode in ("-", "s"):
                    mode = "t"
                i += 1
            elif arg in ("-p", "--parser"):
                # the parser mode supersedes every other mode
                mode = "p"
                i += 1
            # invalid options - nothing is executed
            else:
                print("Opção {} inválida, demais opções ignoradas, por favor execute novamente".format(arg))
                return None
        # input file - only accepted if it exists
        else:
            if os.path.isfile(arg):
                input_file = arg
                i += 1
            else:
                print("O arquivo {} não foi encontrado, por favor execute novamente".format(arg))
                return None
    if mode == "-":
        mode = "p"
    return [output_file, input_file], mode


def _percent(part, whole):
    """Return ``part`` as a percentage of ``whole``; 0.0 when ``whole`` is 0."""
    return 100 * part / whole if whole else 0.0


########################################
# STRUCT checker
########################################
def struct(name, report, samples):
    """Run the structural checks over the CoNLL-U file ``name``.

    name    - the name of the conllu file
    report  - the name of the textual output file
    samples - report object handed through to the check functions

    Returns ``(sentences, tokens, problems, 0)``.
    """
    with open(report, "w", encoding="utf-8") as dump:
        base = conlluFile(name)
        s, t = base.getSandT()
        print("Base:", name, "- sentenças:", s, "- tokens:", t, "- STRUCT relat:", report)
        totalS, totalT = 0, 0
        for i in range(base.getS()):
            b = base.getSentByIndex(i)
            acc = goodNumbers(b, dump, samples)
            # the tree check only runs when goodNumbers reported nothing
            if acc == 0:
                acc = wellformedTree(b, dump, samples)
            totalT += acc
            if acc > 0:
                totalS += 1
    print("Problemas da Estrutura de CoNLL-U (struct):", totalT)
    print("Sentenças sem problems {} ({:2.2f}%)".format(s-totalS, _percent(s-totalS, s)))
    print("Tokens sem problems {} ({:2.2f}%)".format(t-totalT, _percent(t-totalT, t)))
    return s, t, totalT, 0


########################################
# TAGGER checker
########################################
def tagger(name, report, samples):
    """Run the lexical (tagger) checks over the CoNLL-U file ``name``.

    name    - the name of the conllu file
    report  - the name of the textual output file
    samples - report object handed through to the check functions

    Returns ``(sentences, tokens, problems, 0)``.
    """
    with open(report, "w", encoding="utf-8") as dump:
        base = conlluFile(name)
        s, t = base.getSandT()
        print("Base:", name, "- sentenças:", s, "- tokens:", t, "- TAGGER relat:", report)
        totalS, totalT = 0, 0
        for i in range(base.getS()):
            b = base.getSentByIndex(i)
            # b[4] is iterated as the sentence's tokens; b[0] is passed along
            # with each one (presumably the sentence id - confirm in conlluF)
            acc = sum(checkTerm(tk, b[0], dump, samples) for tk in b[4])
            totalT += acc
            if acc > 0:
                totalS += 1
    print("Problemas de Formação Lexical (tagger):", totalT)
    print("Sentenças sem problems {} ({:2.2f}%)".format(s-totalS, _percent(s-totalS, s)))
    print("Tokens sem problems {} ({:2.2f}%)".format(t-totalT, _percent(t-totalT, t)))
    return s, t, totalT, 0


########################################
# PARSER checker
########################################
def parser(name, report, samples):
    """Run the dependency (parser) checks over the CoNLL-U file ``name``.

    name    - the name of the conllu file
    report  - the name of the textual output file
    samples - report object handed through to the check functions

    Returns ``(sentences, tokens, problems, warnings)``.
    """
    with open(report, "w", encoding="utf-8") as dump:
        base = conlluFile(name)
        s, t = base.getSandT()
        print("Base:", name, "- sentenças:", s, "- tokens:", t, "- PARSER relat:", report)
        totalS, totalT, totalW = 0, 0, 0
        for i in range(base.getS()):
            b = base.getSentByIndex(i)
            # the first returned value is not used here
            _, acc, warn = checkSent(b, dump, samples)
            totalT += acc
            totalW += warn
            if acc + warn > 0:
                totalS += 1
    print("Problemas de Dependência Relacional (parser):", totalT)
    print("Avisos de verificação de Dependência Relacional (parser):", totalW)
    print("Sentenças sem problems ou avisos {} ({:2.2f}%)".format(s-totalS, _percent(s-totalS, s)))
    print("Tokens sem problems {} ({:2.2f}%)".format(t-totalT, _percent(t-totalT, t)))
    print("Tokens sem problems ou avisos {} ({:2.2f}%)".format(t-(totalT+totalW), _percent(t-(totalT+totalW), t)))
    return s, t, totalT, totalW


########################################
# mergeReport
########################################
def mergeReport(name, corpus, outfile, s, t, e_s, w_s, e_t, w_t, e_p, w_p,
                stages=("struct", "tagger", "parser")):
    """Merge the per-stage report files into the final report.

    name     - prefix of the stage files (``<name>_<stage>.txt``)
    corpus   - name of the checked file, printed in the report header
    outfile  - open writable text file receiving the merged report
    s, t     - number of sentences and tokens of the corpus
    e_*, w_* - error / warning totals of the struct, tagger and parser stages
    stages   - stage files to read; defaults to all three, callers that ran
               fewer stages pass only those so that missing or stale files
               from other runs are never read

    Lines of the parser file containing "Normalmente" are counted as
    warnings; every other line is counted as an error.
    """
    issues = []
    with_errors = set()
    with_warnings = set()
    for stage in stages:
        with open("{}_{}.txt".format(name, stage), encoding="utf-8") as infile:
            for line in infile:
                entry = line.rstrip("\n")
                if not entry:
                    continue
                issues.append(entry)
                sent_id = entry.split("\t")[0]
                if stage == "parser" and "Normalmente" in entry:
                    with_warnings.add(sent_id)
                else:
                    with_errors.add(sent_id)
    issues.sort()
    # a sentence holding both errors and warnings must be counted only once
    flagged = with_errors | with_warnings
    print("Arquivo:", corpus+"\n", file=outfile)
    print("sentencas: {:>5} (tokens: {})".format(s, t), file=outfile)
    print("sentencas sem erro: {:>5}".format(s-len(with_errors)), file=outfile)
    print("sentencas sem erro ou aviso: {:>5}".format(s-len(flagged)), file=outfile)
    print("erros estruturais: {:>5} - avisos: {:>5}".format(e_s, w_s), file=outfile)
    print("erros lexicais: {:>5} - avisos: {:>5}".format(e_t, w_t), file=outfile)
    print("erros sintáticos: {:>5} - avisos: {:>5}".format(e_p, w_p), file=outfile)
    print("\nErros e avisos encontrados:", file=outfile)
    for entry in issues:
        fields = entry.split("\t")
        # pad short lines so a malformed entry cannot abort the whole report
        fields += [""] * (3 - len(fields))
        print("{:15}\ttoken {:3}\t{}".format(fields[0], fields[1], fields[2]), file=outfile)


########################################
# Do it all
########################################
def doIt(name, outfile, mode):
    """Run the checks selected by ``mode`` and write the merged report.

    name    - the conllu file to check; its name without a trailing
              ".conllu" is the prefix of the per-stage report files
    outfile - open writable text file receiving the merged report
    mode    - "s" (struct), "t" (struct + tagger) or "p" (all three)

    Returns ``(total_errors, total_warnings)``.
    """
    outputfile = name[:-7] if name.endswith(".conllu") else name
    samples = conlluReports("samples", [structDef, taggerDef, parserDef])
    # s and t keep the values of the last stage that actually ran
    s, t = 0, 0
    e_s = w_s = e_t = w_t = e_p = w_p = 0
    stages = []
    try:
        if mode in ("s", "t", "p"):
            print("Executando verificação estrutural...")
            s, t, e_s, w_s = struct(name, outputfile+"_struct.txt", samples)
            stages.append("struct")
        if mode in ("t", "p"):
            print("Executando verificação de tagger...")
            s, t, e_t, w_t = tagger(name, outputfile+"_tagger.txt", samples)
            stages.append("tagger")
        if mode == "p":
            print("Executando verificação de parser...")
            s, t, e_p, w_p = parser(name, outputfile+"_parser.txt", samples)
            stages.append("parser")
    finally:
        samples.closeAll()
    mergeReport(outputfile, name, outfile, s, t, e_s, w_s, e_t, w_t, e_p, w_p,
                stages=stages)
    return e_s+e_t+e_p, w_s+w_t+w_p


########################################
# Main function - verificaUD
########################################
def verificaUD(input_file=None, output_file=None, chosen_mode=None):
    """Entry point: check a CoNLL-U file and save the report.

    When ``input_file``, ``output_file`` and ``chosen_mode`` are all given
    they are used directly; otherwise the options come from ``sys.argv``
    (with no arguments: "corpus.conllu" -> "report.txt", mode "p").
    """
    if input_file and output_file and chosen_mode:
        io_files, mode = [output_file, input_file], chosen_mode
    elif len(sys.argv) == 1:
        io_files, mode = ["report.txt", "corpus.conllu"], "p"
    else:
        # parseOptions returns None when nothing should run
        parsed = parseOptions(sys.argv)
        io_files, mode = parsed if parsed is not None else (None, None)
    if io_files is not None:
        print("Starting verification of", io_files[1])
        with open(io_files[0], "w", encoding="utf-8") as outfile:
            e, w = doIt(io_files[1], outfile, mode)
        print("Arquivo {} salvo com o relatório do arquivo {} contendo {} erros e {} avisos".format(io_files[0], io_files[1], e, w))
    else:
        print("Por favor, execute o programa novamente")