Spaces:
Sleeping
Sleeping
| # VerificaUD - Verificador de arquivos CoNLL-U para textos em Português | |
| # | |
| # Este programa recebe um arquivo CoNLL-U que contém um corpus | |
| # em Português e gera um relatório dos problemas encontrados de | |
| # acordo com regras de verificação definidas: | |
| # https://sol.sbc.org.br/index.php/stil/article/view/25485 | |
| # | |
| # Parâmetros obrigatórios: | |
| # -o <report.txt> --output <report.txt> (arquivo onde o relatório será salvo) | |
| # <corpus.conllu> (arquivo a ser verificado) | |
| # | |
| # Opções: | |
| # -h --help help | |
| # -s --struct só verificação estrutural (struct) | |
| # -t --tagger só verificação estrutural e morfológica (tagger) | |
| # -p --parser verificação estrutural, morfológica e sintática (parser) - default | |
| # | |
| # Exemplo de utilização: | |
| # | |
| # verificaUD -o report.txt corpus.conllu | |
| # | |
| # Lê o arquivo corpus.conllu e salva o resultado da verificação | |
| # no arquivo 'report.txt' | |
| # | |
| # last edit: 09/03/2025 | |
| # created by Lucelene Lopes - lucelene@gmail.com | |
| import sys, os | |
| sys.path.insert(0, 'src/packages') | |
| from conlluF import conlluFile | |
| from tagger import checkTerm, taggerDef | |
| from parser import checkSent, parserDef | |
| from structure import goodNumbers, wellformedTree, structDef | |
| from reporter import conlluReports | |
| ################################################# | |
| ### Captura de argumentos da linha de comando | |
| ################################################# | |
def parseOptions(arguments):
    """Parse the command-line arguments of verificaUD.

    Recognized options:
        -h / --help      print a short usage text; nothing is executed
        -o / --output F  name of the report file
        -s / --struct    structural verification only
        -t / --tagger    structural and morphological verification
        -p / --parser    structural, morphological and syntactic verification
    Any argument that does not start with "-" is taken as the input file and
    must exist on disk.

    When several level options are given, the most comprehensive one wins
    (parser > tagger > struct). When none is given, "p" is used.

    Args:
        arguments: the full argument vector; element 0 (the program name)
            is ignored.

    Returns:
        A pair ``([output_file, input_file], mode)`` with mode in
        {"s", "t", "p"}. When help was requested or an argument is invalid,
        a message is printed and ``(None, None)`` is returned so that callers
        unpacking the result into two names can test the first one.
    """
    # default options ("-" means that no verification level was chosen yet)
    output_file, input_file, mode = "report.conllu", "corpus.conllu", "-"
    failure = (None, None)
    i = 1
    while i < len(arguments):
        arg = arguments[i]
        if arg.startswith("-"):
            # help - shows the usage text, nothing is executed
            # ("-help" is kept for backward compatibility)
            if arg in ("-h", "--help", "-help"):
                print("Opções:\n-h ajuda",
                      "Exemplo de utilização:",
                      "verificaUD -o report.conllu corpus.conllu",
                      "Lê o arquivo corpus.conllu e salva o resultado da verificação",
                      "no arquivo 'report.txt'",
                      sep="\n")
                return failure
            # output file option (consumes the following argument)
            elif arg in ("-o", "--output"):
                if i + 1 >= len(arguments):
                    print("Opção {} requer um nome de arquivo, por favor execute novamente".format(arg))
                    return failure
                output_file = arguments[i + 1]
                i += 2
            elif arg in ("-s", "--struct"):
                # struct never downgrades a level chosen earlier
                if mode == "-":
                    mode = "s"
                i += 1
            elif arg in ("-t", "--tagger"):
                if mode in ("-", "s"):
                    mode = "t"
                i += 1
            elif arg in ("-p", "--parser"):
                # parser is the most comprehensive level, it always wins
                mode = "p"
                i += 1
            # invalid options - nothing is executed
            else:
                print("Opção {} inválida, demais opções ignoradas, por favor execute novamente".format(arg))
                return failure
        # input file - only accepted if it exists
        else:
            if os.path.isfile(arg):
                input_file = arg
                i += 1
            else:
                print("O arquivo {} não foi encontrado, por favor execute novamente".format(arg))
                return failure
    if mode == "-":
        mode = "p"
    return [output_file, input_file], mode
| ######################################## | |
| # Nosso verificador STRUCT | |
| # input: | |
| # name - the name of the conllu file | |
| # report - the name of the textual output file | |
| # samples - the conlluReports object forwarded to the checking routines | |
| ######################################## | |
def struct(name, report, samples):
    """Run the structural (STRUCT) verification over a CoNLL-U file.

    Each sentence is first passed to goodNumbers(); wellformedTree() is only
    run when goodNumbers() returned 0. The value obtained for a sentence is
    added to the problem total, and the sentence counts as problematic when
    that value is positive.

    Args:
        name: path of the CoNLL-U file to be verified.
        report: path of the textual report written by the checking routines.
        samples: object forwarded unchanged to goodNumbers/wellformedTree.

    Returns:
        (s, t, totalT, 0): number of sentences, number of tokens, number of
        problems found, and a constant 0 in the warnings position.
    """
    def _pct(part, whole):
        # an empty corpus must not crash the summary with a division by zero
        return 100 * part / whole if whole else 0.0

    # the report is closed even if one of the checking routines raises
    with open(report, "w") as dump:
        base = conlluFile(name)
        s, t = base.getSandT()
        print("Base:", name, "- sentenças:", s, "- tokens:", t, "- STRUCT relat:", report)
        totalS, totalT = 0, 0
        for i in range(base.getS()):
            b = base.getSentByIndex(i)
            acc = goodNumbers(b, dump, samples)
            if acc == 0:
                acc = wellformedTree(b, dump, samples)
            totalT += acc
            if acc > 0:
                totalS += 1
    print("Problemas da Estrutura de CoNLL-U (struct):", totalT)
    print("Sentenças sem problems {} ({:2.2f}%)".format(s-totalS, _pct(s-totalS, s)))
    print("Tokens sem problems {} ({:2.2f}%)".format(t-totalT, _pct(t-totalT, t)))
    return s, t, totalT, 0
| ######################################## | |
| # Nosso verificador TAGGER | |
| # input: | |
| # name - the name of the conllu file | |
| # report - the name of the textual output file | |
| # samples - the conlluReports object forwarded to the checking routines | |
| ######################################## | |
def tagger(name, report, samples):
    """Run the lexical/morphological (TAGGER) verification over a CoNLL-U file.

    For every sentence, checkTerm() is called once per element of b[4]
    (presumably the token list - confirm against conlluFile) together with
    b[0] (presumably the sentence identifier). The values returned are summed;
    a sentence counts as problematic when its sum is positive.

    Args:
        name: path of the CoNLL-U file to be verified.
        report: path of the textual report written by checkTerm.
        samples: object forwarded unchanged to checkTerm.

    Returns:
        (s, t, totalT, 0): number of sentences, number of tokens, number of
        problems found, and a constant 0 in the warnings position.
    """
    def _pct(part, whole):
        # an empty corpus must not crash the summary with a division by zero
        return 100 * part / whole if whole else 0.0

    # the report is closed even if checkTerm raises
    with open(report, "w") as dump:
        base = conlluFile(name)
        s, t = base.getSandT()
        print("Base:", name, "- sentenças:", s, "- tokens:", t, "- TAGGER relat:", report)
        totalS, totalT = 0, 0
        for i in range(base.getS()):
            b = base.getSentByIndex(i)
            acc = sum(checkTerm(tk, b[0], dump, samples) for tk in b[4])
            totalT += acc
            if acc > 0:
                totalS += 1
    print("Problemas de Formação Lexical (tagger):", totalT)
    print("Sentenças sem problems {} ({:2.2f}%)".format(s-totalS, _pct(s-totalS, s)))
    print("Tokens sem problems {} ({:2.2f}%)".format(t-totalT, _pct(t-totalT, t)))
    return s, t, totalT, 0
| ######################################## | |
| # Nosso verificador PARSER | |
| # input: | |
| # name - the name of the conllu file | |
| # report - the name of the textual output file | |
| # samples - the conlluReports object forwarded to the checking routines | |
| ######################################## | |
def parser(name, report, samples):
    """Run the syntactic (PARSER) verification over a CoNLL-U file.

    checkSent() is called once per sentence and yields three values; the
    second is accumulated as problems and the third as warnings (the first
    is not used here). A sentence counts as flagged when it has at least one
    problem or warning.

    Args:
        name: path of the CoNLL-U file to be verified.
        report: path of the textual report written by checkSent.
        samples: object forwarded unchanged to checkSent.

    Returns:
        (s, t, totalT, totalW): number of sentences, number of tokens,
        number of problems and number of warnings.
    """
    def _pct(part, whole):
        # an empty corpus must not crash the summary with a division by zero
        return 100 * part / whole if whole else 0.0

    # the report is closed even if checkSent raises
    with open(report, "w") as dump:
        base = conlluFile(name)
        s, t = base.getSandT()
        print("Base:", name, "- sentenças:", s, "- tokens:", t, "- PARSER relat:", report)
        totalS, totalT, totalW = 0, 0, 0
        for i in range(base.getS()):
            b = base.getSentByIndex(i)
            _rules, acc, warn = checkSent(b, dump, samples)
            totalT += acc
            totalW += warn
            if acc + warn > 0:
                totalS += 1
    print("Problemas de Dependência Relacional (parser):", totalT)
    print("Avisos de verificação de Dependência Relacional (parser):", totalW)
    print("Sentenças sem problems ou avisos {} ({:2.2f}%)".format(s-totalS, _pct(s-totalS, s)))
    print("Tokens sem problems {} ({:2.2f}%)".format(t-totalT, _pct(t-totalT, t)))
    print("Tokens sem problems ou avisos {} ({:2.2f}%)".format(t-(totalT+totalW), _pct(t-(totalT+totalW), t)))
    return s, t, totalT, totalW
| ######################################## | |
| # mergeReport | |
| ######################################## | |
def mergeReport(name, corpus, outfile, s, t, e_s, w_s, e_t, w_t, e_p, w_p):
    """Merge the partial reports of the three verifiers into one report.

    Reads ``name + "_struct.txt"``, ``name + "_tagger.txt"`` and
    ``name + "_parser.txt"``. Every line is expected to hold tab-separated
    fields whose first three are used: sentence identifier, token, message.
    Lines of the parser report containing "Normalmente" are counted as
    warnings; every other line is counted as an error. A partial report that
    does not exist is treated as empty, and blank lines are ignored.

    Args:
        name: common prefix of the three partial report files.
        corpus: name of the verified corpus, printed in the header.
        outfile: writable text stream that receives the merged report.
        s, t: number of sentences and of tokens of the corpus.
        e_s, w_s: structural errors and warnings.
        e_t, w_t: lexical errors and warnings.
        e_p, w_p: syntactic errors and warnings.
    """
    issues = []
    probS = set()    # identifiers of sentences with at least one error
    avisoS = set()   # identifiers of sentences with at least one warning
    for stage in ("struct", "tagger", "parser"):
        path = name + "_" + stage + ".txt"
        # a verification level that was not run leaves no partial report
        if not os.path.isfile(path):
            continue
        with open(path) as infile:
            for line in infile:
                # strip only the line terminator: the last line may lack one
                line = line.rstrip("\n")
                if not line:
                    continue
                issues.append(line)
                sent_id = line.split("\t")[0]
                if stage == "parser" and "Normalmente" in line:
                    avisoS.add(sent_id)
                else:
                    probS.add(sent_id)
    issues.sort()
    # a sentence with both an error and a warning must be counted only once
    flagged = probS | avisoS
    print("Arquivo:", corpus+"\n", file=outfile)
    # "{:>5}" right-aligns in 5 columns ("{:5>}" would mean fill "5", no width)
    print("sentencas: {:>5} (tokens: {})".format(s, t), file=outfile)
    print("sentencas sem erro: {:>5}".format(s-len(probS)), file=outfile)
    print("sentencas sem erro ou aviso: {:>5}".format(s-len(flagged)), file=outfile)
    print("erros estruturais: {:>5} - avisos: {:>5}".format(e_s, w_s), file=outfile)
    print("erros lexicais: {:>5} - avisos: {:>5}".format(e_t, w_t), file=outfile)
    print("erros sintáticos: {:>5} - avisos: {:>5}".format(e_p, w_p), file=outfile)
    print("\nErros e avisos encontrados:", file=outfile)
    for issue in issues:
        buf = issue.split("\t")
        # pad malformed lines so that a short line cannot raise IndexError
        buf += [""] * (3 - len(buf))
        print("{:15}\ttoken {:3}\t{}".format(buf[0], buf[1], buf[2]), file=outfile)
| ######################################## | |
| # Do it all | |
| ######################################## | |
def doIt(name, outfile, mode):
    """Run the requested verification levels and write the merged report.

    The partial reports are written next to the input file, using its name
    without the ".conllu" extension plus "_struct.txt", "_tagger.txt" and
    "_parser.txt". Levels are cumulative: "s" runs struct only, "t" runs
    struct and tagger, "p" runs struct, tagger and parser.

    Args:
        name: path of the CoNLL-U file to be verified.
        outfile: writable text stream that receives the merged report.
        mode: "s", "t" or "p".

    Returns:
        (errors, warnings): totals summed over the levels that were run.
    """
    def _blank(path):
        # a skipped level gets an empty partial report, so that the merge
        # neither fails on a missing file nor reuses results of an older run
        with open(path, "w"):
            pass

    if name.endswith(".conllu"):
        outputfile = name[:-7]
    else:
        outputfile = name
    samples = conlluReports("samples", [structDef, taggerDef, parserDef])
    # sentence/token counts stay at 0 only when no level is run at all;
    # a skipped level must not reset the counts obtained by a previous one
    s, t = 0, 0
    e_s = w_s = e_t = w_t = e_p = w_p = 0
    try:
        if mode in ("s", "t", "p"):
            print("Executando verificação estrutural...")
            s, t, e_s, w_s = struct(name, outputfile+"_struct.txt", samples)
        else:
            _blank(outputfile+"_struct.txt")
        if mode in ("t", "p"):
            print("Executando verificação de tagger...")
            s, t, e_t, w_t = tagger(name, outputfile+"_tagger.txt", samples)
        else:
            _blank(outputfile+"_tagger.txt")
        if mode == "p":
            print("Executando verificação de parser...")
            s, t, e_p, w_p = parser(name, outputfile+"_parser.txt", samples)
        else:
            _blank(outputfile+"_parser.txt")
    finally:
        # release the sample reports even when a verifier raises
        samples.closeAll()
    mergeReport(outputfile, name, outfile, s, t, e_s, w_s, e_t, w_t, e_p, w_p)
    return e_s+e_t+e_p, w_s+w_t+w_p
| ######################################## | |
| # Main function - verificaUD | |
| ######################################## | |
def verificaUD(input_file=None, output_file=None, chosen_mode=None):
    """Entry point: verify a CoNLL-U file and save the report.

    The files and the mode are taken, in this order, from:
      1. the three arguments, when all of them are given (truthy);
      2. the defaults "report.txt" / "corpus.conllu" / "p", when the program
         was started without command-line arguments;
      3. the command line (sys.argv), parsed by parseOptions().

    Args:
        input_file: path of the CoNLL-U file to be verified.
        output_file: path of the report to be written.
        chosen_mode: "s", "t" or "p".
    """
    if input_file and output_file and chosen_mode:
        io_files = [output_file, input_file]
        mode = chosen_mode
    elif len(sys.argv) == 1:
        io_files = ["report.txt", "corpus.conllu"]
        mode = "p"
    else:
        parsed = parseOptions(sys.argv)
        # parseOptions yields no usable file list when help was requested or
        # the arguments were invalid; tolerate a bare None as well as a pair
        # whose first element is None
        io_files, mode = parsed if parsed is not None else (None, None)
    if io_files is not None:
        print("Starting verification of", io_files[1])
        # the report is closed even if the verification raises
        with open(io_files[0], "w") as outfile:
            e, w = doIt(io_files[1], outfile, mode)
        print("Arquivo {} salvo com o relatório do arquivo {} contendo {} erros e {} avisos".format(io_files[0],io_files[1], e, w))
    else:
        print("Por favor, execute o programa novamente")