File size: 8,361 Bytes
19b8775
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
#!/usr/bin/env python3
"""
CoNLL-U Preprocessing Script

This script validates CoNLL-U files and removes sentences that don't conform
to standard CoNLL-U restrictions, such as:
- Multiple roots (more than one token with head=0)
- Invalid dependencies
- Missing required fields
- Malformed tokens

Usage:
    python clean_conllu.py input.conllu output.conllu
"""

import sys
import argparse
import re
from collections import defaultdict


class CoNLLUValidator:
    """Checks individual sentences for CoNLL-U format compliance."""

    def __init__(self):
        # Errors recorded by the most recent validate_sentence() call.
        self.errors = []

    def validate_sentence(self, sentence_lines, sent_id=None):
        """
        Validate a single sentence and record any problems found.

        Args:
            sentence_lines: List of token lines for the sentence
            sent_id: Sentence ID for error reporting

        Returns:
            bool: True if sentence is valid, False otherwise
        """
        self.errors = []

        if not sentence_lines:
            self.errors.append("Empty sentence")
            return False

        parsed = []
        root_ids = []
        seen_ids = set()

        for idx, raw in enumerate(sentence_lines, 1):
            try:
                cols = raw.split('\t')
                if len(cols) != 10:
                    self.errors.append(f"Line {idx}: Expected 10 fields, got {len(cols)}")
                    continue

                tid, form, lemma, upos, xpos, feats, head, deprel, deps, misc = cols

                # Multiword ranges ("1-2") and empty nodes ("1.1") carry no
                # dependency structure of their own, so they are skipped here.
                if '-' in tid or '.' in tid:
                    continue

                try:
                    tid_num = int(tid)
                    head_num = int(head)
                except ValueError:
                    self.errors.append(f"Line {idx}: Invalid token ID or head: {tid}, {head}")
                    continue

                seen_ids.add(tid_num)

                # head == 0 marks this token as a root candidate.
                if head_num == 0:
                    root_ids.append(tid_num)

                parsed.append({
                    'id': tid_num,
                    'form': form,
                    'lemma': lemma,
                    'upos': upos,
                    'head': head_num,
                    'deprel': deprel,
                    'line_num': idx,
                })

            except Exception as e:
                self.errors.append(f"Line {idx}: Error parsing line: {e}")

        # A well-formed tree has exactly one root.
        if not root_ids:
            self.errors.append("No root found (no token with head=0)")
        elif len(root_ids) > 1:
            self.errors.append(f"Multiple roots found: tokens {root_ids} all have head=0")

        # Every non-root head must reference an existing token ID.
        for tok in parsed:
            if tok['head'] != 0 and tok['head'] not in seen_ids:
                self.errors.append(f"Token {tok['id']} has invalid head {tok['head']}")

        if not self._check_no_cycles(parsed):
            self.errors.append("Dependency cycle detected")

        # FORM, UPOS, and DEPREL must all be present and not '_'.
        for tok in parsed:
            if not tok['form'] or tok['form'] == '_':
                self.errors.append(f"Token {tok['id']}: Empty or missing form")
            if not tok['upos'] or tok['upos'] == '_':
                self.errors.append(f"Token {tok['id']}: Empty or missing UPOS")
            if not tok['deprel'] or tok['deprel'] == '_':
                self.errors.append(f"Token {tok['id']}: Empty or missing deprel")

        return not self.errors

    def _check_no_cycles(self, tokens):
        """Walk head pointers from every token; revisiting a node means a cycle."""
        head_of = {t['id']: t['head'] for t in tokens}

        for start in tokens:
            trail = set()
            node = start['id']

            # Stop at the root (0) or when the chain leaves the known tokens.
            while node != 0 and node in head_of:
                if node in trail:
                    return False  # Cycle detected
                trail.add(node)
                node = head_of[node]

        return True

    def get_errors(self):
        """Return the errors recorded by the last validate_sentence() call."""
        return self.errors


def clean_conllu_file(input_file, output_file, verbose=False):
    """
    Clean a CoNLL-U file by removing invalid sentences.

    Args:
        input_file: Path to input CoNLL-U file
        output_file: Path to output cleaned file
        verbose: Whether to print detailed information about removed sentences
    """
    validator = CoNLLUValidator()

    valid_sentences = 0
    invalid_sentences = 0
    total_sentences = 0

    def _flush(sentence, comments, sent_id, line_num=None):
        """Validate one buffered sentence; write it to outfile when valid.

        Returns True if the sentence was written, False if it was dropped.
        """
        if validator.validate_sentence(sentence, sent_id):
            for comment in comments:
                outfile.write(comment + '\n')
            for token_line in sentence:
                outfile.write(token_line + '\n')
            outfile.write('\n')
            return True
        if verbose:
            # Only the mid-file flush knows an approximate input line number.
            location = f" (line ~{line_num})" if line_num is not None else ""
            print(f"Removed sentence {sent_id}{location}: {'; '.join(validator.get_errors())}")
        return False

    with open(input_file, 'r', encoding='utf-8') as infile, \
         open(output_file, 'w', encoding='utf-8') as outfile:

        current_sentence = []
        current_comments = []
        sent_id = None

        for line_num, line in enumerate(infile, 1):
            line = line.rstrip('\n')

            if line.startswith('#'):
                current_comments.append(line)
                # Remember the sentence ID for error reporting.
                if line.startswith('# sent_id'):
                    sent_id = line.split('=', 1)[1].strip() if '=' in line else str(total_sentences + 1)
            elif line.strip() == '':
                # Blank line terminates the current sentence.
                if current_sentence:
                    total_sentences += 1
                    if _flush(current_sentence, current_comments, sent_id, line_num):
                        valid_sentences += 1
                    else:
                        invalid_sentences += 1
                    current_sentence = []
                    current_comments = []
                    sent_id = None
            else:
                # Token line
                current_sentence.append(line)

        # Handle the last sentence if the file doesn't end with an empty line.
        if current_sentence:
            total_sentences += 1
            if _flush(current_sentence, current_comments, sent_id):
                valid_sentences += 1
            else:
                invalid_sentences += 1

    print("Processing complete:")
    print(f"  Total sentences: {total_sentences}")
    print(f"  Valid sentences: {valid_sentences}")
    print(f"  Invalid sentences removed: {invalid_sentences}")
    # Bug fix: the previous conditional-expression form evaluated to a bare,
    # unprinted string for empty files, so the 0% line never appeared.
    rate = valid_sentences / total_sentences * 100 if total_sentences > 0 else 0.0
    print(f"  Success rate: {rate:.1f}%")


def main():
    """CLI entry point: parse arguments, run the cleaner, exit 1 on failure."""
    arg_parser = argparse.ArgumentParser(
        description='Clean CoNLL-U files by removing invalid sentences'
    )
    arg_parser.add_argument('input_file', help='Input CoNLL-U file')
    arg_parser.add_argument('output_file', help='Output cleaned CoNLL-U file')
    arg_parser.add_argument('-v', '--verbose', action='store_true',
                            help='Verbose output')
    options = arg_parser.parse_args()

    try:
        clean_conllu_file(options.input_file, options.output_file, options.verbose)
    except FileNotFoundError:
        # A missing input file gets a friendlier message than a traceback.
        print(f"Error: Input file '{options.input_file}' not found")
        sys.exit(1)
    except Exception as exc:
        print(f"Error: {exc}")
        sys.exit(1)


# Run as a script: delegate to the CLI entry point.
if __name__ == '__main__':
    main()