File size: 1,092 Bytes
def fix_empty_fields(input_file, output_file):
    """Copy a tab-separated annotation file, filling empty fields with "_".

    Blank lines and lines starting with "#" are copied verbatim.  Every
    other line is split on tabs; if it has at least 10 fields, each field
    that is an empty string is replaced by "_" and the line is re-joined
    and written with a trailing newline.  Lines with fewer than 10 fields
    are written unchanged.

    Args:
        input_file: Path of the file to read (decoded as UTF-8).
        output_file: Path of the file to write (encoded as UTF-8); it is
            created or truncated.

    Raises:
        OSError: If either file cannot be opened.
    """
    with open(input_file, 'r', encoding='utf-8') as f_in, \
            open(output_file, 'w', encoding='utf-8') as f_out:
        for line in f_in:
            if not line.strip() or line.startswith('#'):
                f_out.write(line)
                continue
            # Remove only the line terminator before splitting.  A plain
            # strip() would also eat a trailing tab, so a row whose LAST
            # field is empty would lose that field, fall below the
            # 10-field threshold and be written out unrepaired.
            fields = line.rstrip('\n').split('\t')
            if len(fields) >= 10:
                # some ufeats in punctuation are an empty string instead
                # of "_", stanza cannot deal with this
                fields = [field if field != "" else "_" for field in fields]
                line = '\t'.join(fields) + '\n'
            f_out.write(line)
    print(f"Fixed {input_file} → {output_file}")
# Repair each data split in turn, writing a "*_fixed" copy next to the
# original file.  Order is train, dev, test.
for source_path, fixed_path in (
    ("../la_giuseppe/train.conllu", "../la_giuseppe/train_fixed.conllu"),
    ("../la_giuseppe/dev.conllu", "../la_giuseppe/dev_fixed.conllu"),
    ("../la_giuseppe/test.conllu", "../la_giuseppe/test_fixed.conllu"),
):
    fix_empty_fields(source_path, fixed_path)
|