import os
import random
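
# pythainlp is optional: it is only needed by reprocess_lines below, so the
# import is allowed to fail here and is checked again at the call site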
try:
from pythainlp import sent_tokenize
except ImportError:
pass
def write_section(output_dir, dataset_name, section, documents):
"""
Writes a list of documents for tokenization, including a file in conll format
The Thai datasets generally have no MWT (apparently not relevant for Thai)
output_dir: the destination directory for the output files
dataset_name: orchid, BEST, lst20, etc
section: train/dev/test
documents: a nested list of documents, paragraphs, sentences, words
words is a list of (word, space_follows)
"""
    with open(os.path.join(output_dir, 'th_%s-ud-%s-mwt.json' % (dataset_name, section)), 'w') as fout:
        fout.write("[]\n")

    with open(os.path.join(output_dir, 'th_%s.%s.txt' % (dataset_name, section)), 'w') as text_out, \
         open(os.path.join(output_dir, 'th_%s-ud-%s.toklabels' % (dataset_name, section)), 'w') as label_out:
        for document in documents:
            for paragraph in document:
                for sentence_idx, sentence in enumerate(paragraph):
                    for word_idx, word in enumerate(sentence):
                        # TODO: split with newlines to make it more readable?
                        text_out.write(word[0])
                        # one label per character:
                        #   0 = inside a word, 1 = last character of a word,
                        #   2 = last character of a sentence
                        for i in range(len(word[0]) - 1):
                            label_out.write("0")
                        if word_idx == len(sentence) - 1:
                            label_out.write("2")
                        else:
                            label_out.write("1")
                        # spaces between words are labeled 0; the space at the
                        # very end of a paragraph is dropped entirely
                        if word[1] and (sentence_idx != len(paragraph) - 1 or word_idx != len(sentence) - 1):
                            text_out.write(' ')
                            label_out.write('0')
                text_out.write("\n\n")
                label_out.write("\n\n")
with open(os.path.join(output_dir, 'th_%s.%s.gold.conllu' % (dataset_name, section)), 'w') as fout:
for document in documents:
for paragraph in document:
new_par = True
for sentence in paragraph:
for word_idx, word in enumerate(sentence):
                        # MISC column: '_' if there is a space after the word,
                        # SpaceAfter=No otherwise; NewPar=Yes marks the first
                        # word of a new paragraph
if word[1] and new_par:
space = 'NewPar=Yes'
elif word[1]:
space = '_'
elif new_par:
space = 'SpaceAfter=No|NewPar=Yes'
else:
space = 'SpaceAfter=No'
new_par = False
# Note the faked dependency structure: the conll reading code
# needs it even if it isn't being used in any way
fake_dep = 'root' if word_idx == 0 else 'dep'
fout.write('{}\t{}\t_\t_\t_\t_\t{}\t{}\t{}:{}\t{}\n'.format(word_idx+1, word[0], word_idx, fake_dep, word_idx, fake_dep, space))
fout.write('\n')
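
# The helper below is an illustrative sketch, not part of the original
# pipeline: it shows the nesting write_section expects and the character
# label scheme it emits. The directory and dataset names are hypothetical.
def _demo_write_section(output_dir="/tmp/thai_demo"):
    # one document -> one paragraph -> one sentence of (word, space_follows)
    documents = [[[[("สวัสดี", False), ("ครับ", True)]]]]
    os.makedirs(output_dir, exist_ok=True)
    write_section(output_dir, "demo", "train", documents)
    # the resulting demo .toklabels file contains "0000010002":
    # 0 = inside a word, 1 = last character of a non-final word,
    # 2 = last character of the sentence-final word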
def write_dataset(documents, output_dir, dataset_name):
"""
Shuffle a list of documents, write three sections
"""
random.shuffle(documents)
num_train = int(len(documents) * 0.8)
num_dev = int(len(documents) * 0.1)
os.makedirs(output_dir, exist_ok=True)
write_section(output_dir, dataset_name, 'train', documents[:num_train])
write_section(output_dir, dataset_name, 'dev', documents[num_train:num_train+num_dev])
write_section(output_dir, dataset_name, 'test', documents[num_train+num_dev:])
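
# A hedged usage sketch (names hypothetical): with 10 one-sentence documents,
# write_dataset yields an 8 train / 1 dev / 1 test document split.
def _demo_write_dataset(output_dir="/tmp/thai_demo"):
    documents = [[[[("สวัสดี", True)]]] for _ in range(10)]
    write_dataset(documents, output_dir, "demo")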
def write_dataset_best(documents, test_documents, output_dir, dataset_name):
"""
Shuffle a list of documents, write three sections
"""
random.shuffle(documents)
num_train = int(len(documents) * 0.85)
num_dev = int(len(documents) * 0.15)
os.makedirs(output_dir, exist_ok=True)
write_section(output_dir, dataset_name, 'train', documents[:num_train])
write_section(output_dir, dataset_name, 'dev', documents[num_train:num_train+num_dev])
write_section(output_dir, dataset_name, 'test', test_documents)
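
# For write_dataset_best the test split is BEST's own predefined test set
# rather than a random slice; a hypothetical call looks like:
#
#     write_dataset_best(train_docs, best_test_docs, output_dir, 'best')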
def reprocess_lines(processed_lines):
"""
Reprocesses lines using pythainlp to cut up sentences into shorter sentences.
Many of the lines in BEST seem to be multiple Thai sentences concatenated, according to native Thai speakers.
Input: a list of lines, where each line is a list of words. Space characters can be included as words
Output: a new list of lines, resplit using pythainlp
"""
reprocessed_lines = []
for line in processed_lines:
text = "".join(line)
try:
chunks = sent_tokenize(text)
except NameError as e:
raise NameError("Sentences cannot be reprocessed without first installing pythainlp") from e
        # sanity check: the re-split chunks must exactly tile the original text
if sum(len(x) for x in chunks) != len(text):
raise ValueError("Got unexpected text length: \n{}\nvs\n{}".format(text, chunks))
chunk_lengths = [len(x) for x in chunks]
current_length = 0
new_line = []
        for word in line:
            if len(word) + current_length < chunk_lengths[0]:
                # the word fits entirely within the current chunk
                new_line.append(word)
                current_length = current_length + len(word)
            elif len(word) + current_length == chunk_lengths[0]:
                # the word exactly finishes the current chunk
                new_line.append(word)
                reprocessed_lines.append(new_line)
                new_line = []
                chunk_lengths = chunk_lengths[1:]
                current_length = 0
            else:
                # the word straddles a chunk boundary: split it, closing the
                # current chunk and any following chunks it completely covers
                remaining_len = chunk_lengths[0] - current_length
                new_line.append(word[:remaining_len])
                reprocessed_lines.append(new_line)
                word = word[remaining_len:]
                chunk_lengths = chunk_lengths[1:]
                while chunk_lengths and len(word) >= chunk_lengths[0]:
                    new_line = [word[:chunk_lengths[0]]]
                    reprocessed_lines.append(new_line)
                    word = word[chunk_lengths[0]:]
                    chunk_lengths = chunk_lengths[1:]
                # carry any leftover fragment into the next line
                new_line = [word] if word else []
                current_length = len(word)
        # don't append an empty line when the last word exactly closed a chunk
        if new_line:
            reprocessed_lines.append(new_line)
return reprocessed_lines
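
# An illustrative sketch of reprocess_lines (requires pythainlp; the words
# are arbitrary): each input line is re-split wherever sent_tokenize finds a
# sentence boundary, and a word straddling a boundary is cut in two, so the
# output lines exactly tile the original text.
#
#     lines = [["วันนี้", "อากาศ", "ดี", "พรุ่งนี้", "ฝน", "อาจ", "ตก"]]
#     shorter_lines = reprocess_lines(lines)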
def convert_processed_lines(processed_lines):
"""
Convert a list of sentences into documents suitable for the output methods in this module.
Input: a list of lines, including space words
Output: a list of documents, each document containing a list of sentences
Each sentence is a list of words: (text, space_follows)
Space words will be eliminated.
"""
    documents = []
    sentences = []
for words in processed_lines:
        # drop a single leading space word before building the sentence
        if len(words) > 1 and " " == words[0]:
            words = words[1:]
        elif len(words) == 1 and " " == words[0]:
            words = []
        sentence = []
        for word in words:
            word = word.strip()
            if not word:
                # a space word marks space_follows on the previous word
                if len(sentence) == 0:
                    raise ValueError("Unexpected space at the start of a sentence")
                sentence[-1] = (sentence[-1][0], True)
            else:
                sentence.append((word, False))
        # blank lines are very rare in BEST, but why not treat them as a document break
        if len(sentence) == 0:
            documents.append([sentences])
            sentences = []
            continue
        sentence[-1] = (sentence[-1][0], True)
        sentences.append(sentence)
    documents.append([sentences])
    return documents
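
# A sketch of convert_processed_lines on a hypothetical pre-tokenized line:
# the space word becomes space_follows=True on the preceding word, and the
# final word of the line is always marked space_follows=True.
#
#     lines = [["สวัสดี", " ", "ครับ"]]
#     docs = convert_processed_lines(lines)
#     # docs == [[[[("สวัสดี", True), ("ครับ", True)]]]]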