import os
import random
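
# pythainlp is optional: it is only needed by reprocess_lines below, so the
# import is allowed to fail here and is checked again at the call site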
try:
from pythainlp import sent_tokenize
except ImportError:
pass
def write_section(output_dir, dataset_name, section, documents):
"""
Writes a list of documents for tokenization, including a file in conll format
The Thai datasets generally have no MWT (apparently not relevant for Thai)
output_dir: the destination directory for the output files
dataset_name: orchid, BEST, lst20, etc
section: train/dev/test
documents: a nested list of documents, paragraphs, sentences, words
words is a list of (word, space_follows)
"""
    with open(os.path.join(output_dir, 'th_%s-ud-%s-mwt.json' % (dataset_name, section)), 'w') as fout:
        fout.write("[]\n")

    with open(os.path.join(output_dir, 'th_%s.%s.txt' % (dataset_name, section)), 'w') as text_out, \
         open(os.path.join(output_dir, 'th_%s-ud-%s.toklabels' % (dataset_name, section)), 'w') as label_out:
        for document in documents:
            for paragraph in document:
                for sentence_idx, sentence in enumerate(paragraph):
                    for word_idx, word in enumerate(sentence):
                        # TODO: split with newlines to make it more readable?
                        text_out.write(word[0])
                        # one label per character:
                        #   0 = inside a word, 1 = last character of a word,
                        #   2 = last character of a sentence
                        for i in range(len(word[0]) - 1):
                            label_out.write("0")
                        if word_idx == len(sentence) - 1:
                            label_out.write("2")
                        else:
                            label_out.write("1")
                        # spaces between words are labeled 0; the space at the
                        # very end of a paragraph is dropped entirely
                        if word[1] and (sentence_idx != len(paragraph) - 1 or word_idx != len(sentence) - 1):
                            text_out.write(' ')
                            label_out.write('0')
                text_out.write("\n\n")
                label_out.write("\n\n")
with open(os.path.join(output_dir, 'th_%s.%s.gold.conllu' % (dataset_name, section)), 'w') as fout:
for document in documents:
for paragraph in document:
new_par = True
for sentence in paragraph:
for word_idx, word in enumerate(sentence):
                        # MISC column: '_' if there is a space after the word,
                        # SpaceAfter=No otherwise; NewPar=Yes marks the first
                        # word of a new paragraph
if word[1] and new_par:
space = 'NewPar=Yes'
elif word[1]:
space = '_'
elif new_par:
space = 'SpaceAfter=No|NewPar=Yes'
else:
space = 'SpaceAfter=No'
new_par = False
# Note the faked dependency structure: the conll reading code
# needs it even if it isn't being used in any way
fake_dep = 'root' if word_idx == 0 else 'dep'
fout.write('{}\t{}\t_\t_\t_\t_\t{}\t{}\t{}:{}\t{}\n'.format(word_idx+1, word[0], word_idx, fake_dep, word_idx, fake_dep, space))
fout.write('\n')
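
# The helper below is an illustrative sketch, not part of the original
# pipeline: it shows the nesting write_section expects and the character
# label scheme it emits. The directory and dataset names are hypothetical.
def _demo_write_section(output_dir="/tmp/thai_demo"):
    # one document -> one paragraph -> one sentence of (word, space_follows)
    documents = [[[[("สวัสดี", False), ("ครับ", True)]]]]
    os.makedirs(output_dir, exist_ok=True)
    write_section(output_dir, "demo", "train", documents)
    # the resulting demo .toklabels file contains "0000010002":
    # 0 = inside a word, 1 = last character of a non-final word,
    # 2 = last character of the sentence-final word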
def write_dataset(documents, output_dir, dataset_name):
"""
Shuffle a list of documents, write three sections
"""
random.shuffle(documents)
num_train = int(len(documents) * 0.8)
num_dev = int(len(documents) * 0.1)
os.makedirs(output_dir, exist_ok=True)
write_section(output_dir, dataset_name, 'train', documents[:num_train])
write_section(output_dir, dataset_name, 'dev', documents[num_train:num_train+num_dev])
write_section(output_dir, dataset_name, 'test', documents[num_train+num_dev:])
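
# A hedged usage sketch (names hypothetical): with 10 one-sentence documents,
# write_dataset yields an 8 train / 1 dev / 1 test document split.
def _demo_write_dataset(output_dir="/tmp/thai_demo"):
    documents = [[[[("สวัสดี", True)]]] for _ in range(10)]
    write_dataset(documents, output_dir, "demo")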
def write_dataset_best(documents, test_documents, output_dir, dataset_name):
"""
Shuffle a list of documents, write three sections
"""
random.shuffle(documents)
num_train = int(len(documents) * 0.85)
num_dev = int(len(documents) * 0.15)
os.makedirs(output_dir, exist_ok=True)
write_section(output_dir, dataset_name, 'train', documents[:num_train])
write_section(output_dir, dataset_name, 'dev', documents[num_train:num_train+num_dev])
write_section(output_dir, dataset_name, 'test', test_documents)
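
# For write_dataset_best the test split is BEST's own predefined test set
# rather than a random slice; a hypothetical call looks like:
#
#     write_dataset_best(train_docs, best_test_docs, output_dir, 'best')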
def reprocess_lines(processed_lines):
"""
Reprocesses lines using pythainlp to cut up sentences into shorter sentences.
Many of the lines in BEST seem to be multiple Thai sentences concatenated, according to native Thai speakers.
Input: a list of lines, where each line is a list of words. Space characters can be included as words
Output: a new list of lines, resplit using pythainlp
"""
reprocessed_lines = []
for line in processed_lines:
text = "".join(line)
try:
chunks = sent_tokenize(text)
except NameError as e:
raise NameError("Sentences cannot be reprocessed without first installing pythainlp") from e
        # sanity check: the re-split chunks must exactly tile the original text
if sum(len(x) for x in chunks) != len(text):
raise ValueError("Got unexpected text length: \n{}\nvs\n{}".format(text, chunks))
chunk_lengths = [len(x) for x in chunks]
current_length = 0
new_line = []
        for word in line:
            if len(word) + current_length < chunk_lengths[0]:
                # the word fits entirely within the current chunk
                new_line.append(word)
                current_length = current_length + len(word)
            elif len(word) + current_length == chunk_lengths[0]:
                # the word exactly finishes the current chunk
                new_line.append(word)
                reprocessed_lines.append(new_line)
                new_line = []
                chunk_lengths = chunk_lengths[1:]
                current_length = 0
            else:
                # the word straddles a chunk boundary: split it, closing the
                # current chunk and any following chunks it completely covers
                remaining_len = chunk_lengths[0] - current_length
                new_line.append(word[:remaining_len])
                reprocessed_lines.append(new_line)
                word = word[remaining_len:]
                chunk_lengths = chunk_lengths[1:]
                while chunk_lengths and len(word) >= chunk_lengths[0]:
                    new_line = [word[:chunk_lengths[0]]]
                    reprocessed_lines.append(new_line)
                    word = word[chunk_lengths[0]:]
                    chunk_lengths = chunk_lengths[1:]
                # carry any leftover fragment into the next line
                new_line = [word] if word else []
                current_length = len(word)
        # don't append an empty line when the last word exactly closed a chunk
        if new_line:
            reprocessed_lines.append(new_line)
return reprocessed_lines
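
# An illustrative sketch of reprocess_lines (requires pythainlp; the words
# are arbitrary): each input line is re-split wherever sent_tokenize finds a
# sentence boundary, and a word straddling a boundary is cut in two, so the
# output lines exactly tile the original text.
#
#     lines = [["วันนี้", "อากาศ", "ดี", "พรุ่งนี้", "ฝน", "อาจ", "ตก"]]
#     shorter_lines = reprocess_lines(lines)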
def convert_processed_lines(processed_lines):
"""
Convert a list of sentences into documents suitable for the output methods in this module.
Input: a list of lines, including space words
Output: a list of documents, each document containing a list of sentences
Each sentence is a list of words: (text, space_follows)
Space words will be eliminated.
"""
    documents = []
    sentences = []
for words in processed_lines:
        # drop a single leading space word before building the sentence
        if len(words) > 1 and " " == words[0]:
            words = words[1:]
        elif len(words) == 1 and " " == words[0]:
            words = []
        sentence = []
        for word in words:
            word = word.strip()
            if not word:
                # a space word marks space_follows on the previous word
                if len(sentence) == 0:
                    raise ValueError("Unexpected space at the start of a sentence")
                sentence[-1] = (sentence[-1][0], True)
            else:
                sentence.append((word, False))
        # blank lines are very rare in BEST, but why not treat them as a document break
        if len(sentence) == 0:
            documents.append([sentences])
            sentences = []
            continue
        sentence[-1] = (sentence[-1][0], True)
        sentences.append(sentence)
    documents.append([sentences])
    return documents
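
# A sketch of convert_processed_lines on a hypothetical pre-tokenized line:
# the space word becomes space_follows=True on the preceding word, and the
# final word of the line is always marked space_follows=True.
#
#     lines = [["สวัสดี", " ", "ครับ"]]
#     docs = convert_processed_lines(lines)
#     # docs == [[[[("สวัสดี", True), ("ครับ", True)]]]]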