File size: 1,092 Bytes
c34decd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
def fix_empty_fields(input_file: str, output_file: str) -> None:
    """Copy a CoNLL-U file, replacing empty tab-separated fields with "_".

    Some rows (notably punctuation) carry an empty string where the format
    expects the "_" placeholder, e.g. in the feature column; stanza cannot
    deal with such rows, so they are repaired here.

    Blank lines and lines starting with "#" are copied through unchanged.
    Rows with fewer than 10 tab-separated fields are also written back
    unchanged. Rows with 10 or more fields have every empty field replaced
    by "_" and are written terminated by a single newline.

    Args:
        input_file: Path of the file to read (UTF-8).
        output_file: Path of the repaired file to write (UTF-8); it is
            created or overwritten.
    """
    with open(input_file, 'r', encoding='utf-8') as f_in, \
            open(output_file, 'w', encoding='utf-8') as f_out:
        for line in f_in:
            if not line.strip() or line.startswith('#'):
                f_out.write(line)
                continue

            # Remove only the line terminator. A bare strip() would also eat
            # leading/trailing tabs, which discards an empty first or last
            # column: the row then has fewer than 10 fields and would be
            # written out unrepaired.
            fields = line.rstrip('\r\n').split('\t')
            if len(fields) >= 10:
                line = '\t'.join(field or '_' for field in fields) + '\n'

            f_out.write(line)
    print(f"Fixed {input_file} -> {output_file}")

# Repair each data split in turn, writing a "*_fixed" copy next to the original.
for _source_path, _target_path in (
    ("../la_giuseppe/train.conllu", "../la_giuseppe/train_fixed.conllu"),
    ("../la_giuseppe/dev.conllu", "../la_giuseppe/dev_fixed.conllu"),
    ("../la_giuseppe/test.conllu", "../la_giuseppe/test_fixed.conllu"),
):
    fix_empty_fields(_source_path, _target_path)