# Source: stanza-digphil / scripts / clean_conllu.py
# Author: Albin Thörn Cleland
# Commit: 19b8775 — "Clean initial commit with LFS"
#!/usr/bin/env python3
"""
CoNLL-U Preprocessing Script
This script validates CoNLL-U files and removes sentences that don't conform
to standard CoNLL-U restrictions, such as:
- Multiple roots (more than one token with head=0)
- Invalid dependencies
- Missing required fields
- Malformed tokens
Usage:
python clean_conllu.py input.conllu output.conllu
"""
import sys
import argparse
import re
from collections import defaultdict
class CoNLLUValidator:
    """Validator for CoNLL-U format compliance.

    Accumulates human-readable error messages for one sentence at a time:
    call validate_sentence(); when it returns False, get_errors() explains
    why the sentence was rejected.
    """

    def __init__(self):
        # Errors recorded by the most recent validate_sentence() call.
        self.errors = []

    def validate_sentence(self, sentence_lines, sent_id=None):
        """
        Validate a single sentence.

        Args:
            sentence_lines: List of token lines for the sentence
            sent_id: Sentence ID for error reporting (accepted for
                interface compatibility; not embedded in the messages)

        Returns:
            bool: True if sentence is valid, False otherwise
        """
        self.errors = []
        if not sentence_lines:
            self.errors.append("Empty sentence")
            return False

        tokens, roots, token_ids = self._parse_tokens(sentence_lines)
        self._check_roots(roots)
        self._check_heads(tokens, token_ids)
        # Check for cycles (basic check)
        if not self._check_no_cycles(tokens):
            self.errors.append("Dependency cycle detected")
        self._check_required_fields(tokens)
        return len(self.errors) == 0

    def _parse_tokens(self, sentence_lines):
        """Parse token lines; return (tokens, roots, token_ids).

        Multiword-token ranges ("3-4") and empty nodes ("3.1") are skipped:
        they do not participate in the basic dependency tree. Malformed
        lines are recorded in self.errors and skipped.

        NOTE: the previous version wrapped this loop in a broad
        ``except Exception`` handler; after the 10-field length check the
        remaining statements cannot raise, so the handler was unreachable
        and has been removed.
        """
        tokens = []
        roots = []
        token_ids = set()
        for line_num, line in enumerate(sentence_lines, 1):
            fields = line.split('\t')
            if len(fields) != 10:
                self.errors.append(
                    f"Line {line_num}: Expected 10 fields, got {len(fields)}")
                continue
            token_id, form, lemma, upos, xpos, feats, head, deprel, deps, misc = fields
            # Skip multiword tokens and empty nodes for dependency validation
            if '-' in token_id or '.' in token_id:
                continue
            try:
                token_id_int = int(token_id)
                head_int = int(head)
            except ValueError:
                self.errors.append(
                    f"Line {line_num}: Invalid token ID or head: {token_id}, {head}")
                continue
            token_ids.add(token_id_int)
            # Check for root
            if head_int == 0:
                roots.append(token_id_int)
            tokens.append({
                'id': token_id_int,
                'form': form,
                'lemma': lemma,
                'upos': upos,
                'head': head_int,
                'deprel': deprel,
                'line_num': line_num
            })
        return tokens, roots, token_ids

    def _check_roots(self, roots):
        """Require exactly one token with head=0."""
        if len(roots) == 0:
            self.errors.append("No root found (no token with head=0)")
        elif len(roots) > 1:
            self.errors.append(
                f"Multiple roots found: tokens {roots} all have head=0")

    def _check_heads(self, tokens, token_ids):
        """Every non-root head must point to an existing token ID."""
        for token in tokens:
            if token['head'] != 0 and token['head'] not in token_ids:
                self.errors.append(
                    f"Token {token['id']} has invalid head {token['head']}")

    def _check_required_fields(self, tokens):
        """FORM, UPOS and DEPREL must be present and not underscore."""
        for token in tokens:
            if not token['form'] or token['form'] == '_':
                self.errors.append(f"Token {token['id']}: Empty or missing form")
            if not token['upos'] or token['upos'] == '_':
                self.errors.append(f"Token {token['id']}: Empty or missing UPOS")
            if not token['deprel'] or token['deprel'] == '_':
                self.errors.append(f"Token {token['id']}: Empty or missing deprel")

    def _check_no_cycles(self, tokens):
        """Basic cycle detection: follow the head chain from every token."""
        token_heads = {t['id']: t['head'] for t in tokens}
        for start_token in tokens:
            visited = set()
            current = start_token['id']
            while current != 0 and current in token_heads:
                if current in visited:
                    return False  # Cycle detected
                visited.add(current)
                current = token_heads[current]
        return True

    def get_errors(self):
        """Get list of validation errors from the last validation."""
        return self.errors
def _write_sentence(outfile, comments, token_lines):
    """Write one sentence: comment lines, token lines, and the terminating
    blank line required by the CoNLL-U format."""
    for comment in comments:
        outfile.write(comment + '\n')
    for token_line in token_lines:
        outfile.write(token_line + '\n')
    outfile.write('\n')


def clean_conllu_file(input_file, output_file, verbose=False):
    """
    Clean a CoNLL-U file by removing invalid sentences.

    Args:
        input_file: Path to input CoNLL-U file
        output_file: Path to output cleaned file
        verbose: Whether to print detailed information

    Prints a summary (sentence counts and success rate) when done.
    """
    validator = CoNLLUValidator()
    valid_sentences = 0
    invalid_sentences = 0
    total_sentences = 0

    with open(input_file, 'r', encoding='utf-8') as infile, \
            open(output_file, 'w', encoding='utf-8') as outfile:
        current_sentence = []
        current_comments = []
        sent_id = None

        def flush(line_num=None):
            # Validate the buffered sentence; write it if valid, otherwise
            # count it as removed (and report it in verbose mode). This
            # replaces the previously duplicated blank-line/EOF handling.
            nonlocal valid_sentences, invalid_sentences, total_sentences
            total_sentences += 1
            if validator.validate_sentence(current_sentence, sent_id):
                _write_sentence(outfile, current_comments, current_sentence)
                valid_sentences += 1
            else:
                invalid_sentences += 1
                if verbose:
                    where = f" (line ~{line_num})" if line_num is not None else ""
                    print(f"Removed sentence {sent_id}{where}: "
                          f"{'; '.join(validator.get_errors())}")

        for line_num, line in enumerate(infile, 1):
            line = line.rstrip('\n')
            if line.startswith('#'):
                current_comments.append(line)
                if line.startswith('# sent_id'):
                    sent_id = line.split('=', 1)[1].strip() if '=' in line else str(total_sentences + 1)
            elif line.strip() == '':
                # Blank line ends the current sentence
                if current_sentence:
                    flush(line_num)
                    current_sentence = []
                    current_comments = []
                    sent_id = None
            else:
                # Token line
                current_sentence.append(line)

        # Handle last sentence if file doesn't end with an empty line
        if current_sentence:
            flush()

    print("Processing complete:")
    print(f" Total sentences: {total_sentences}")
    print(f" Valid sentences: {valid_sentences}")
    print(f" Invalid sentences removed: {invalid_sentences}")
    if total_sentences > 0:
        print(f" Success rate: {valid_sentences/total_sentences*100:.1f}%")
    else:
        print(" Success rate: 0%")
def main():
    """Command-line entry point: parse arguments and clean the given file."""
    arg_parser = argparse.ArgumentParser(
        description='Clean CoNLL-U files by removing invalid sentences')
    arg_parser.add_argument('input_file', help='Input CoNLL-U file')
    arg_parser.add_argument('output_file', help='Output cleaned CoNLL-U file')
    arg_parser.add_argument('-v', '--verbose', action='store_true',
                            help='Verbose output')
    opts = arg_parser.parse_args()

    try:
        clean_conllu_file(opts.input_file, opts.output_file, opts.verbose)
    except FileNotFoundError:
        # Missing input gets a specific message; anything else falls through
        # to the generic handler below. Both exit with status 1.
        print(f"Error: Input file '{opts.input_file}' not found")
        sys.exit(1)
    except Exception as exc:
        print(f"Error: {exc}")
        sys.exit(1)


if __name__ == '__main__':
    main()