latin_experiments / stanza /fix_conllu.py
bowphs's picture
Add files using upload-large-folder tool
c34decd verified
def fix_empty_fields(input_file: str, output_file: str) -> None:
    """Copy a CoNLL-U file, replacing empty token columns with ``_``.

    Blank lines and lines starting with ``#`` are copied unchanged. Every
    other line is split on tabs; when it has at least 10 columns, each
    column that is an empty string is replaced by ``_`` and the line is
    re-joined with tabs and terminated by a single newline. Lines with
    fewer than 10 columns are copied unchanged.

    A short status message naming both paths is printed when done.

    Args:
        input_file: Path of the file to read (decoded as UTF-8).
        output_file: Path of the file to write (encoded as UTF-8); opened
            in ``'w'`` mode, so existing content is replaced.
    """
    with open(input_file, 'r', encoding='utf-8') as f_in, \
            open(output_file, 'w', encoding='utf-8') as f_out:
        for line in f_in:
            # Sentence separators and comment/metadata lines pass through.
            if not line.strip() or line.startswith('#'):
                f_out.write(line)
                continue

            # Remove only the line terminator. A full strip() would also
            # eat leading/trailing tabs, so a line whose first or last
            # column is empty would lose that column, fall below the
            # 10-column threshold and never be repaired.
            fields = line.rstrip('\r\n').split('\t')

            if len(fields) >= 10:
                # some ufeats in punctuation are an empty string instead
                # of "_", stanza cannot deal with this
                fields = [field or "_" for field in fields]
                line = '\t'.join(fields) + '\n'

            f_out.write(line)

    print(f"Fixed {input_file} -> {output_file}")
# Repair each data split, writing the result next to the original file
# under a "_fixed" name. Order is train, dev, test.
for _source_path, _fixed_path in (
    ("../la_giuseppe/train.conllu", "../la_giuseppe/train_fixed.conllu"),
    ("../la_giuseppe/dev.conllu", "../la_giuseppe/dev_fixed.conllu"),
    ("../la_giuseppe/test.conllu", "../la_giuseppe/test_fixed.conllu"),
):
    fix_empty_fields(_source_path, _fixed_path)