# Source: stanza-digphil / scripts / clean_conllu.py
# Author: Albin Thörn Cleland
# Commit: 19b8775 — "Clean initial commit with LFS"
#!/usr/bin/env python3
"""
CoNLL-U Preprocessing Script
This script validates CoNLL-U files and removes sentences that don't conform
to standard CoNLL-U restrictions, such as:
- Multiple roots (more than one token with head=0)
- Invalid dependencies
- Missing required fields
- Malformed tokens
Usage:
python clean_conllu.py input.conllu output.conllu
"""
import sys
import argparse
import re
from collections import defaultdict
class CoNLLUValidator:
    """Validator for CoNLL-U format compliance.

    Accumulates human-readable error messages for one sentence at a time:
    call validate_sentence(); when it returns False, get_errors() explains
    why the sentence was rejected.
    """

    def __init__(self):
        # Errors recorded by the most recent validate_sentence() call.
        self.errors = []

    def validate_sentence(self, sentence_lines, sent_id=None):
        """
        Validate a single sentence.

        Args:
            sentence_lines: List of token lines for the sentence
            sent_id: Sentence ID for error reporting (accepted for
                interface compatibility; not embedded in the messages)

        Returns:
            bool: True if sentence is valid, False otherwise
        """
        self.errors = []
        if not sentence_lines:
            self.errors.append("Empty sentence")
            return False

        tokens, roots, token_ids = self._parse_tokens(sentence_lines)
        self._check_roots(roots)
        self._check_heads(tokens, token_ids)
        # Check for cycles (basic check)
        if not self._check_no_cycles(tokens):
            self.errors.append("Dependency cycle detected")
        self._check_required_fields(tokens)
        return len(self.errors) == 0

    def _parse_tokens(self, sentence_lines):
        """Parse token lines; return (tokens, roots, token_ids).

        Multiword-token ranges ("3-4") and empty nodes ("3.1") are skipped:
        they do not participate in the basic dependency tree. Malformed
        lines are recorded in self.errors and skipped.

        NOTE: the previous version wrapped this loop in a broad
        ``except Exception`` handler; after the 10-field length check the
        remaining statements cannot raise, so the handler was unreachable
        and has been removed.
        """
        tokens = []
        roots = []
        token_ids = set()
        for line_num, line in enumerate(sentence_lines, 1):
            fields = line.split('\t')
            if len(fields) != 10:
                self.errors.append(
                    f"Line {line_num}: Expected 10 fields, got {len(fields)}")
                continue
            token_id, form, lemma, upos, xpos, feats, head, deprel, deps, misc = fields
            # Skip multiword tokens and empty nodes for dependency validation
            if '-' in token_id or '.' in token_id:
                continue
            try:
                token_id_int = int(token_id)
                head_int = int(head)
            except ValueError:
                self.errors.append(
                    f"Line {line_num}: Invalid token ID or head: {token_id}, {head}")
                continue
            token_ids.add(token_id_int)
            # Check for root
            if head_int == 0:
                roots.append(token_id_int)
            tokens.append({
                'id': token_id_int,
                'form': form,
                'lemma': lemma,
                'upos': upos,
                'head': head_int,
                'deprel': deprel,
                'line_num': line_num
            })
        return tokens, roots, token_ids

    def _check_roots(self, roots):
        """Require exactly one token with head=0."""
        if len(roots) == 0:
            self.errors.append("No root found (no token with head=0)")
        elif len(roots) > 1:
            self.errors.append(
                f"Multiple roots found: tokens {roots} all have head=0")

    def _check_heads(self, tokens, token_ids):
        """Every non-root head must point to an existing token ID."""
        for token in tokens:
            if token['head'] != 0 and token['head'] not in token_ids:
                self.errors.append(
                    f"Token {token['id']} has invalid head {token['head']}")

    def _check_required_fields(self, tokens):
        """FORM, UPOS and DEPREL must be present and not underscore."""
        for token in tokens:
            if not token['form'] or token['form'] == '_':
                self.errors.append(f"Token {token['id']}: Empty or missing form")
            if not token['upos'] or token['upos'] == '_':
                self.errors.append(f"Token {token['id']}: Empty or missing UPOS")
            if not token['deprel'] or token['deprel'] == '_':
                self.errors.append(f"Token {token['id']}: Empty or missing deprel")

    def _check_no_cycles(self, tokens):
        """Basic cycle detection: follow the head chain from every token."""
        token_heads = {t['id']: t['head'] for t in tokens}
        for start_token in tokens:
            visited = set()
            current = start_token['id']
            while current != 0 and current in token_heads:
                if current in visited:
                    return False  # Cycle detected
                visited.add(current)
                current = token_heads[current]
        return True

    def get_errors(self):
        """Get list of validation errors from the last validation."""
        return self.errors
def _write_sentence(outfile, comments, token_lines):
    """Write one sentence: comment lines, token lines, and the terminating
    blank line required by the CoNLL-U format."""
    for comment in comments:
        outfile.write(comment + '\n')
    for token_line in token_lines:
        outfile.write(token_line + '\n')
    outfile.write('\n')


def clean_conllu_file(input_file, output_file, verbose=False):
    """
    Clean a CoNLL-U file by removing invalid sentences.

    Args:
        input_file: Path to input CoNLL-U file
        output_file: Path to output cleaned file
        verbose: Whether to print detailed information

    Prints a summary (sentence counts and success rate) when done.
    """
    validator = CoNLLUValidator()
    valid_sentences = 0
    invalid_sentences = 0
    total_sentences = 0

    with open(input_file, 'r', encoding='utf-8') as infile, \
            open(output_file, 'w', encoding='utf-8') as outfile:
        current_sentence = []
        current_comments = []
        sent_id = None

        def flush(line_num=None):
            # Validate the buffered sentence; write it if valid, otherwise
            # count it as removed (and report it in verbose mode). This
            # replaces the previously duplicated blank-line/EOF handling.
            nonlocal valid_sentences, invalid_sentences, total_sentences
            total_sentences += 1
            if validator.validate_sentence(current_sentence, sent_id):
                _write_sentence(outfile, current_comments, current_sentence)
                valid_sentences += 1
            else:
                invalid_sentences += 1
                if verbose:
                    where = f" (line ~{line_num})" if line_num is not None else ""
                    print(f"Removed sentence {sent_id}{where}: "
                          f"{'; '.join(validator.get_errors())}")

        for line_num, line in enumerate(infile, 1):
            line = line.rstrip('\n')
            if line.startswith('#'):
                current_comments.append(line)
                if line.startswith('# sent_id'):
                    sent_id = line.split('=', 1)[1].strip() if '=' in line else str(total_sentences + 1)
            elif line.strip() == '':
                # Blank line ends the current sentence
                if current_sentence:
                    flush(line_num)
                    current_sentence = []
                    current_comments = []
                    sent_id = None
            else:
                # Token line
                current_sentence.append(line)

        # Handle last sentence if file doesn't end with an empty line
        if current_sentence:
            flush()

    print("Processing complete:")
    print(f" Total sentences: {total_sentences}")
    print(f" Valid sentences: {valid_sentences}")
    print(f" Invalid sentences removed: {invalid_sentences}")
    if total_sentences > 0:
        print(f" Success rate: {valid_sentences/total_sentences*100:.1f}%")
    else:
        print(" Success rate: 0%")
def main():
    """Command-line entry point: parse arguments and clean the given file."""
    arg_parser = argparse.ArgumentParser(
        description='Clean CoNLL-U files by removing invalid sentences')
    arg_parser.add_argument('input_file', help='Input CoNLL-U file')
    arg_parser.add_argument('output_file', help='Output cleaned CoNLL-U file')
    arg_parser.add_argument('-v', '--verbose', action='store_true',
                            help='Verbose output')
    opts = arg_parser.parse_args()

    try:
        clean_conllu_file(opts.input_file, opts.output_file, opts.verbose)
    except FileNotFoundError:
        # Missing input gets a specific message; anything else falls through
        # to the generic handler below. Both exit with status 1.
        print(f"Error: Input file '{opts.input_file}' not found")
        sys.exit(1)
    except Exception as exc:
        print(f"Error: {exc}")
        sys.exit(1)


if __name__ == '__main__':
    main()