Spaces:

NILC-ICMC-USP
/

VerificaUD

Sleeping

File size: 10,574 Bytes

# VerificaUD - Verificador de arquivos CoNLL-U para textos em Português
#
# Este programa recebe um arquivo CoNLL-U que contém um corpus
#   em Português e gera um relatório dos problemas encontrados de
#   acordo com regras de verificação definidas:
#         https://sol.sbc.org.br/index.php/stil/article/view/25485
#
# Parâmetros obrigatórios:
# -o <report.txt> --output <report.txt>   (arquivo onde o relatório será salvo)
# <corpus.conllu>    (arquivo a ser verificado)
#
# Opções:
# -h --help help
# -s --struct só verificação estrutural (struct)
# -t --tagger só verificação estrutural e morfológica (tagger)
# -p --parser verificação estrutural, morfológica e sintática (parser) - default
# 
# Exemplo de utilização:
#
# verificaUD -o report.txt corpus.conllu
#
# Lê o arquivo corpus.conllu e salva o resultado da verificação
#   no arquivo 'report.txt'
#
# last edit: 09/03/2025
# created by Lucelene Lopes - lucelene@gmail.com

import sys, os
sys.path.insert(0, 'src/packages')
from conlluF import conlluFile
from tagger import checkTerm, taggerDef
from parser import checkSent, parserDef
from structure import goodNumbers, wellformedTree, structDef
from reporter import conlluReports

#################################################
### Captura de argumentos da linha de comando
#################################################
def parseOptions(arguments):
    """Parse the verificaUD command-line arguments.

    Args:
        arguments: argument vector in ``sys.argv`` layout; index 0 (the
            program name) is skipped.

    Returns:
        A tuple ``([output_file, input_file], mode)`` where ``mode`` is one of
        ``"s"`` (struct), ``"t"`` (tagger) or ``"p"`` (parser, the default).
        ``None`` is returned, after printing a message, when help was
        requested, an option is invalid or incomplete, or the input file
        does not exist.
    """
    # default options
    # NOTE(review): the default report name ends in ".conllu" while the help
    # text and verificaUD() use "report.txt" - kept as-is for compatibility.
    output_file, input_file, mode = "report.conllu", "corpus.conllu", "-"
    # A mode flag only ever upgrades the current mode: struct < tagger < parser.
    rank = {"-": 0, "s": 1, "t": 2, "p": 3}
    mode_flags = {
        "-s": "s", "--struct": "s",
        "-t": "t", "--tagger": "t",
        "-p": "p", "--parser": "p",
    }
    i = 1
    while i < len(arguments):
        arg = arguments[i]
        # startswith() is safe for "" and for a lone "-", unlike indexing.
        if arg.startswith("-"):
            # help - show usage, nothing is executed
            # ("-help" is still accepted for backward compatibility)
            if arg in ("-h", "--help", "-help"):
                print("Opções:",
                      "-h --help ajuda",
                      "-o --output <report.txt> arquivo onde o relatório será salvo",
                      "-s --struct só verificação estrutural (struct)",
                      "-t --tagger só verificação estrutural e morfológica (tagger)",
                      "-p --parser verificação estrutural, morfológica e sintática (parser) - default",
                      "Exemplo de utilização:",
                      "verificaUD -o report.txt corpus.conllu",
                      "Lê o arquivo corpus.conllu e salva o resultado da verificação",
                      "no arquivo 'report.txt'",
                      sep="\n")
                return None
            # output file option (takes a file name)
            elif arg in ("-o", "--output"):
                if i + 1 >= len(arguments):
                    print("A opção {} requer um nome de arquivo, por favor execute novamente".format(arg))
                    return None
                output_file = arguments[i+1]
                i += 2
            # verification depth options
            elif arg in mode_flags:
                requested = mode_flags[arg]
                if rank[requested] > rank[mode]:
                    mode = requested
                i += 1
            # invalid options - nothing is executed
            else:
                print("Opção {} inválida, demais opções ignoradas, por favor execute novamente".format(arg))
                return None
        # input file - only accepted if it exists
        else:
            if os.path.isfile(arg):
                input_file = arg
                i += 1
            else:
                print("O arquivo {} não foi encontrado, por favor execute novamente".format(arg))
                return None
    if mode == "-":
        mode = "p"
    return [output_file, input_file], mode

########################################
#  STRUCT checker (CoNLL-U structure)
#  input:
#     name     -  the name of the conllu file
#     report   -  the name of the textual output file
#     samples  -  the conlluReports object forwarded to the check functions
########################################
def struct(name, report, samples):
    """Run the structural (STRUCT) checks over a CoNLL-U file.

    Each sentence is passed to ``goodNumbers``; ``wellformedTree`` is run on
    it only when ``goodNumbers`` returned 0. The open report file and
    ``samples`` are handed to both helpers, and a summary is printed to stdout.

    Args:
        name: path of the CoNLL-U file to verify.
        report: path of the text file opened for writing and passed to the
            check helpers.
        samples: object forwarded unchanged to the check helpers.

    Returns:
        Tuple ``(s, t, totalT, 0)``: sentence count, token count, accumulated
        problem count, and a constant 0 in the warnings position.
    """
    totalS, totalT = 0, 0
    # The context manager closes the report even if a check raises.
    with open(report, "w") as dump:
        base = conlluFile(name)
        s, t = base.getSandT()
        print("Base:", name, "- sentenças:", s, "- tokens:", t, "- STRUCT relat:", report)
        for i in range(base.getS()):
            b = base.getSentByIndex(i)
            acc = goodNumbers(b, dump, samples)
            # the tree check only runs on sentences that passed goodNumbers
            if acc == 0:
                acc = wellformedTree(b, dump, samples)
            totalT += acc
            if acc > 0:
                totalS += 1
    # An empty corpus would otherwise divide by zero; report 0.00% instead.
    pctS = 100*(s-totalS)/s if s else 0.0
    pctT = 100*(t-totalT)/t if t else 0.0
    print("Problemas da Estrutura de CoNLL-U (struct):", totalT)
    print("Sentenças sem problems {} ({:2.2f}%)".format(s-totalS, pctS))
    print("Tokens sem problems {} ({:2.2f}%)".format(t-totalT, pctT))
    return s, t, totalT, 0

########################################
#  TAGGER checker (lexical formation)
#  input:
#     name     -  the name of the conllu file
#     report   -  the name of the textual output file
#     samples  -  the conlluReports object forwarded to the check functions
########################################
def tagger(name, report, samples):
    """Run the lexical (TAGGER) checks over a CoNLL-U file.

    For every sentence, ``checkTerm`` is called once per element of ``b[4]``
    and its return values are summed. A summary is printed to stdout.

    Args:
        name: path of the CoNLL-U file to verify.
        report: path of the text file opened for writing and passed to
            ``checkTerm``.
        samples: object forwarded unchanged to ``checkTerm``.

    Returns:
        Tuple ``(s, t, totalT, 0)``: sentence count, token count, accumulated
        problem count, and a constant 0 in the warnings position.
    """
    totalS, totalT = 0, 0
    # The context manager closes the report even if a check raises.
    with open(report, "w") as dump:
        base = conlluFile(name)
        s, t = base.getSandT()
        print("Base:", name, "- sentenças:", s, "- tokens:", t, "- TAGGER relat:", report)
        for i in range(base.getS()):
            b = base.getSentByIndex(i)
            # presumably b[4] holds the sentence tokens and b[0] its id -
            # TODO confirm against conlluFile
            acc = sum(checkTerm(tk, b[0], dump, samples) for tk in b[4])
            totalT += acc
            if acc > 0:
                totalS += 1
    # An empty corpus would otherwise divide by zero; report 0.00% instead.
    pctS = 100*(s-totalS)/s if s else 0.0
    pctT = 100*(t-totalT)/t if t else 0.0
    print("Problemas de Formação Lexical (tagger):", totalT)
    print("Sentenças sem problems {} ({:2.2f}%)".format(s-totalS, pctS))
    print("Tokens sem problems {} ({:2.2f}%)".format(t-totalT, pctT))
    return s, t, totalT, 0

########################################
#  PARSER checker (dependency relations)
#  input:
#     name     -  the name of the conllu file
#     report   -  the name of the textual output file
#     samples  -  the conlluReports object forwarded to the check functions
########################################
def parser(name, report, samples):
    """Run the dependency (PARSER) checks over a CoNLL-U file.

    ``checkSent`` is called once per sentence; its second and third return
    values are accumulated as problems and warnings respectively. A summary
    is printed to stdout.

    Args:
        name: path of the CoNLL-U file to verify.
        report: path of the text file opened for writing and passed to
            ``checkSent``.
        samples: object forwarded unchanged to ``checkSent``.

    Returns:
        Tuple ``(s, t, totalT, totalW)``: sentence count, token count,
        accumulated problem count and accumulated warning count.
    """
    totalS, totalT, totalW = 0, 0, 0
    # The context manager closes the report even if a check raises.
    with open(report, "w") as dump:
        base = conlluFile(name)
        s, t = base.getSandT()
        print("Base:", name, "- sentenças:", s, "- tokens:", t, "- PARSER relat:", report)
        for i in range(base.getS()):
            b = base.getSentByIndex(i)
            # the first value returned by checkSent is not used here
            _rules, acc, warn = checkSent(b, dump, samples)
            totalT += acc
            totalW += warn
            # a sentence is flagged when it has a problem or a warning
            if acc + warn > 0:
                totalS += 1
    # An empty corpus would otherwise divide by zero; report 0.00% instead.
    pctS = 100*(s-totalS)/s if s else 0.0
    pctT = 100*(t-totalT)/t if t else 0.0
    pctTW = 100*(t-(totalT+totalW))/t if t else 0.0
    print("Problemas de Dependência Relacional (parser):", totalT)
    print("Avisos de verificação de Dependência Relacional (parser):", totalW)
    print("Sentenças sem problems ou avisos {} ({:2.2f}%)".format(s-totalS, pctS))
    print("Tokens sem problems {} ({:2.2f}%)".format(t-totalT, pctT))
    print("Tokens sem problems ou avisos {} ({:2.2f}%)".format(t-(totalT+totalW), pctTW))
    return s, t, totalT, totalW

########################################
#  mergeReport
########################################
def mergeReport(name, corpus, outfile, s, t, e_s, w_s, e_t, w_t, e_p, w_p):
    """Merge the partial struct/tagger/parser reports into one final report.

    Reads ``name + "_struct.txt"``, ``name + "_tagger.txt"`` and
    ``name + "_parser.txt"`` (tab-separated lines whose first field
    identifies the sentence), then writes a summary header followed by all
    lines, sorted, to ``outfile``. Lines of the parser report containing
    "Normalmente" are counted as warnings; every other line is an error.

    Args:
        name: common path prefix of the three partial report files.
        corpus: name of the verified corpus, printed in the header.
        outfile: writable text stream receiving the merged report.
        s: number of sentences in the corpus.
        t: number of tokens in the corpus.
        e_s, w_s: structural error and warning counts.
        e_t, w_t: lexical error and warning counts.
        e_p, w_p: syntactic error and warning counts.
    """
    issues = []
    # sets: membership is only used for de-duplication and counting
    probS = set()
    avisoS = set()
    for suffix in ("_struct.txt", "_tagger.txt", "_parser.txt"):
        path = name + suffix
        # A stage that was not run may not have produced its partial report.
        if not os.path.isfile(path):
            continue
        with open(path) as infile:
            for line in infile:
                # strip only the newline: the last line may lack one
                line = line.rstrip("\n")
                if not line:
                    continue
                issues.append(line)
                sent = line.split("\t")[0]
                if suffix == "_parser.txt" and "Normalmente" in line:
                    avisoS.add(sent)
                else:
                    probS.add(sent)
    issues.sort()
    # A sentence with both an error and a warning must be subtracted once.
    flagged = probS | avisoS
    print("Arquivo:", corpus+"\n", file=outfile)
    print("sentencas:                   {:>5} (tokens: {})".format(s,t), file=outfile)
    print("sentencas sem erro:          {:>5}".format(s-len(probS)), file=outfile)
    print("sentencas sem erro ou aviso: {:>5}".format(s-len(flagged)), file=outfile)
    print("erros estruturais: {:>5} - avisos: {:>5}".format(e_s, w_s), file=outfile)
    print("erros lexicais:    {:>5} - avisos: {:>5}".format(e_t, w_t), file=outfile)
    print("erros sintáticos:  {:>5} - avisos: {:>5}".format(e_p, w_p), file=outfile)

    print("\nErros e avisos encontrados:", file=outfile)
    for issue in issues:
        buf = issue.split("\t")
        # pad short lines so a malformed entry is still reported, not a crash
        buf += [""] * (3 - len(buf))
        print("{:15}\ttoken {:3}\t{}".format(buf[0], buf[1], buf[2]), file=outfile)

########################################
#  Do it all
########################################
def doIt(name, outfile, mode):
    """Run the verification stages selected by ``mode`` and merge the reports.

    The stages are cumulative: "s" runs struct only, "t" runs struct and
    tagger, "p" runs struct, tagger and parser. Each stage writes a partial
    report named after the input file (without its ".conllu" suffix), and
    ``mergeReport`` then writes the merged report to ``outfile``.

    Args:
        name: path of the CoNLL-U file to verify.
        outfile: writable text stream receiving the merged report.
        mode: "s", "t" or "p"; any other value runs no stage.

    Returns:
        Tuple ``(errors, warnings)`` summed over the stages that ran.
    """
    outputfile = name[:-7] if name.endswith(".conllu") else name
    samples = conlluReports("samples", [structDef,taggerDef,parserDef])
    # Skipped stages must not reset the corpus totals of stages that ran,
    # so every counter is initialised once, up front.
    s, t = 0, 0
    e_s, w_s, e_t, w_t, e_p, w_p = 0, 0, 0, 0, 0, 0
    skipped = []
    try:
        if mode in ("s", "t", "p"):
            print("Executando verificação estrutural...")
            s, t, e_s, w_s = struct(name, outputfile+"_struct.txt", samples)
        else:
            skipped.append("_struct.txt")
        if mode in ("t", "p"):
            print("Executando verificação de tagger...")
            s, t, e_t, w_t = tagger(name, outputfile+"_tagger.txt", samples)
        else:
            skipped.append("_tagger.txt")
        if mode == "p":
            print("Executando verificação de parser...")
            s, t, e_p, w_p = parser(name, outputfile+"_parser.txt", samples)
        else:
            skipped.append("_parser.txt")
    finally:
        # release the sample reports even if a stage raised
        samples.closeAll()
    # mergeReport reads all three partial reports: write an empty one for
    # every skipped stage so it neither fails on a missing file nor merges
    # stale results left behind by an earlier run.
    for suffix in skipped:
        with open(outputfile+suffix, "w"):
            pass
    mergeReport(outputfile, name, outfile, s, t, e_s, w_s, e_t, w_t, e_p, w_p)
    return e_s+e_t+e_p, w_s+w_t+w_p


########################################
#  Main function - verificaUD
########################################
def verificaUD(input_file=None, output_file=None, chosen_mode=None):
    """Entry point: verify a CoNLL-U file and save the report.

    The input file, output file and mode come from the keyword arguments
    when all three are given; otherwise from built-in defaults when the
    program was started without command-line arguments; otherwise from
    ``parseOptions(sys.argv)``.

    Args:
        input_file: path of the CoNLL-U file to verify.
        output_file: path of the report file to write.
        chosen_mode: "s", "t" or "p".

    Returns:
        None. Progress and result messages are printed to stdout.
    """
    if input_file and output_file and chosen_mode:
        io_files, mode = [output_file, input_file], chosen_mode
    elif len(sys.argv) == 1:
        io_files, mode = ["report.txt", "corpus.conllu"], "p"
    else:
        # parseOptions returns None (not a pair) for help or invalid input,
        # so it must be checked before unpacking.
        parsed = parseOptions(sys.argv)
        io_files, mode = parsed if parsed is not None else (None, None)
    if io_files is None:
        print("Por favor, execute o programa novamente")
        return
    print("Starting verification of", io_files[1])
    # the context manager closes the report even if the verification raises
    with open(io_files[0], "w") as outfile:
        e, w = doIt(io_files[1], outfile, mode)
    print("Arquivo {} salvo com o relatório do arquivo {} contendo {} erros e {} avisos".format(io_files[0],io_files[1], e, w))