File size: 8,361 Bytes
19b8775
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
#!/usr/bin/env python3
"""
CoNLL-U Preprocessing Script

This script validates CoNLL-U files and removes sentences that don't conform
to standard CoNLL-U restrictions, such as:
- Multiple roots (more than one token with head=0)
- Invalid dependencies
- Missing required fields
- Malformed tokens

Usage:
    python clean_conllu.py input.conllu output.conllu
"""

import sys
import argparse
import re
from collections import defaultdict


class CoNLLUValidator:
    """Checks individual sentences for CoNLL-U format compliance."""

    def __init__(self):
        # Errors recorded by the most recent validate_sentence() call.
        self.errors = []

    def validate_sentence(self, sentence_lines, sent_id=None):
        """
        Validate a single sentence and record any problems found.

        Args:
            sentence_lines: List of token lines for the sentence
            sent_id: Sentence ID for error reporting

        Returns:
            bool: True if sentence is valid, False otherwise
        """
        self.errors = []

        if not sentence_lines:
            self.errors.append("Empty sentence")
            return False

        parsed = []
        root_ids = []
        seen_ids = set()

        for idx, raw in enumerate(sentence_lines, 1):
            try:
                cols = raw.split('\t')
                if len(cols) != 10:
                    self.errors.append(f"Line {idx}: Expected 10 fields, got {len(cols)}")
                    continue

                tid, form, lemma, upos, xpos, feats, head, deprel, deps, misc = cols

                # Multiword ranges ("1-2") and empty nodes ("1.1") carry no
                # dependency structure of their own, so they are skipped here.
                if '-' in tid or '.' in tid:
                    continue

                try:
                    tid_num = int(tid)
                    head_num = int(head)
                except ValueError:
                    self.errors.append(f"Line {idx}: Invalid token ID or head: {tid}, {head}")
                    continue

                seen_ids.add(tid_num)

                # head == 0 marks this token as a root candidate.
                if head_num == 0:
                    root_ids.append(tid_num)

                parsed.append({
                    'id': tid_num,
                    'form': form,
                    'lemma': lemma,
                    'upos': upos,
                    'head': head_num,
                    'deprel': deprel,
                    'line_num': idx,
                })

            except Exception as e:
                self.errors.append(f"Line {idx}: Error parsing line: {e}")

        # A well-formed tree has exactly one root.
        if not root_ids:
            self.errors.append("No root found (no token with head=0)")
        elif len(root_ids) > 1:
            self.errors.append(f"Multiple roots found: tokens {root_ids} all have head=0")

        # Every non-root head must reference an existing token ID.
        for tok in parsed:
            if tok['head'] != 0 and tok['head'] not in seen_ids:
                self.errors.append(f"Token {tok['id']} has invalid head {tok['head']}")

        if not self._check_no_cycles(parsed):
            self.errors.append("Dependency cycle detected")

        # FORM, UPOS, and DEPREL must all be present and not '_'.
        for tok in parsed:
            if not tok['form'] or tok['form'] == '_':
                self.errors.append(f"Token {tok['id']}: Empty or missing form")
            if not tok['upos'] or tok['upos'] == '_':
                self.errors.append(f"Token {tok['id']}: Empty or missing UPOS")
            if not tok['deprel'] or tok['deprel'] == '_':
                self.errors.append(f"Token {tok['id']}: Empty or missing deprel")

        return not self.errors

    def _check_no_cycles(self, tokens):
        """Walk head pointers from every token; revisiting a node means a cycle."""
        head_of = {t['id']: t['head'] for t in tokens}

        for start in tokens:
            trail = set()
            node = start['id']

            # Stop at the root (0) or when the chain leaves the known tokens.
            while node != 0 and node in head_of:
                if node in trail:
                    return False  # Cycle detected
                trail.add(node)
                node = head_of[node]

        return True

    def get_errors(self):
        """Return the errors recorded by the last validate_sentence() call."""
        return self.errors


def clean_conllu_file(input_file, output_file, verbose=False):
    """
    Clean a CoNLL-U file by removing invalid sentences.

    Args:
        input_file: Path to input CoNLL-U file
        output_file: Path to output cleaned file
        verbose: Whether to print detailed information about removed sentences
    """
    validator = CoNLLUValidator()

    valid_sentences = 0
    invalid_sentences = 0
    total_sentences = 0

    def _flush(sentence, comments, sent_id, line_num=None):
        """Validate one buffered sentence; write it to outfile when valid.

        Returns True if the sentence was written, False if it was dropped.
        """
        if validator.validate_sentence(sentence, sent_id):
            for comment in comments:
                outfile.write(comment + '\n')
            for token_line in sentence:
                outfile.write(token_line + '\n')
            outfile.write('\n')
            return True
        if verbose:
            # Only the mid-file flush knows an approximate input line number.
            location = f" (line ~{line_num})" if line_num is not None else ""
            print(f"Removed sentence {sent_id}{location}: {'; '.join(validator.get_errors())}")
        return False

    with open(input_file, 'r', encoding='utf-8') as infile, \
         open(output_file, 'w', encoding='utf-8') as outfile:

        current_sentence = []
        current_comments = []
        sent_id = None

        for line_num, line in enumerate(infile, 1):
            line = line.rstrip('\n')

            if line.startswith('#'):
                current_comments.append(line)
                # Remember the sentence ID for error reporting.
                if line.startswith('# sent_id'):
                    sent_id = line.split('=', 1)[1].strip() if '=' in line else str(total_sentences + 1)
            elif line.strip() == '':
                # Blank line terminates the current sentence.
                if current_sentence:
                    total_sentences += 1
                    if _flush(current_sentence, current_comments, sent_id, line_num):
                        valid_sentences += 1
                    else:
                        invalid_sentences += 1
                    current_sentence = []
                    current_comments = []
                    sent_id = None
            else:
                # Token line
                current_sentence.append(line)

        # Handle the last sentence if the file doesn't end with an empty line.
        if current_sentence:
            total_sentences += 1
            if _flush(current_sentence, current_comments, sent_id):
                valid_sentences += 1
            else:
                invalid_sentences += 1

    print("Processing complete:")
    print(f"  Total sentences: {total_sentences}")
    print(f"  Valid sentences: {valid_sentences}")
    print(f"  Invalid sentences removed: {invalid_sentences}")
    # Bug fix: the previous conditional-expression form evaluated to a bare,
    # unprinted string for empty files, so the 0% line never appeared.
    rate = valid_sentences / total_sentences * 100 if total_sentences > 0 else 0.0
    print(f"  Success rate: {rate:.1f}%")


def main():
    """CLI entry point: parse arguments, run the cleaner, exit 1 on failure."""
    arg_parser = argparse.ArgumentParser(
        description='Clean CoNLL-U files by removing invalid sentences'
    )
    arg_parser.add_argument('input_file', help='Input CoNLL-U file')
    arg_parser.add_argument('output_file', help='Output cleaned CoNLL-U file')
    arg_parser.add_argument('-v', '--verbose', action='store_true',
                            help='Verbose output')
    options = arg_parser.parse_args()

    try:
        clean_conllu_file(options.input_file, options.output_file, options.verbose)
    except FileNotFoundError:
        # A missing input file gets a friendlier message than a traceback.
        print(f"Error: Input file '{options.input_file}' not found")
        sys.exit(1)
    except Exception as exc:
        print(f"Error: {exc}")
        sys.exit(1)


# Run as a script: delegate to the CLI entry point.
if __name__ == '__main__':
    main()