def fix_empty_fields(input_file, output_file):
    """Copy a tab-separated (CoNLL-U style) file, filling empty fields with "_".

    Blank lines and lines starting with "#" are copied unchanged. Any other
    line that has at least 10 tab-separated fields gets every empty field
    replaced by "_" and is written back terminated by a single newline.
    Lines with fewer than 10 fields are copied unchanged.

    Args:
        input_file: Path of the file to read (decoded as UTF-8).
        output_file: Path of the file to write (encoded as UTF-8); it is
            opened in "w" mode, so existing content is replaced.
    """
    with open(input_file, 'r', encoding='utf-8') as f_in, \
            open(output_file, 'w', encoding='utf-8') as f_out:
        for line in f_in:
            if not line.strip() or line.startswith('#'):
                f_out.write(line)
                continue
            # Remove only the line terminator. A plain strip() would also eat
            # a trailing tab, so a row whose last field is empty would drop
            # to 9 fields and slip through unrepaired.
            fields = line.rstrip('\r\n').split('\t')
            if len(fields) >= 10:
                # Some ufeats in punctuation are an empty string instead of
                # "_"; stanza cannot deal with this.
                fields = [field if field else "_" for field in fields]
                line = '\t'.join(fields) + '\n'
            f_out.write(line)
    print(f"Fixed {input_file} → {output_file}")
# Repair the train, dev and test .conllu files under ../la_giuseppe, writing
# each result to a sibling "*_fixed.conllu" file.
for source_path, fixed_path in (
    ("../la_giuseppe/train.conllu", "../la_giuseppe/train_fixed.conllu"),
    ("../la_giuseppe/dev.conllu", "../la_giuseppe/dev_fixed.conllu"),
    ("../la_giuseppe/test.conllu", "../la_giuseppe/test_fixed.conllu"),
):
    fix_empty_fields(source_path, fixed_path)