import os
import random

# pythainlp is optional: it is only needed by reprocess_lines below
try:
    from pythainlp import sent_tokenize
except ImportError:
    pass

def write_section(output_dir, dataset_name, section, documents):
    """
    Writes a list of documents for tokenization, including a file in conll format

    The Thai datasets generally have no MWT (apparently not relevant for Thai)

    output_dir: the destination directory for the output files
    dataset_name: orchid, BEST, lst20, etc
    section: train/dev/test
    documents: a nested list of documents, paragraphs, sentences, words
      words is a list of (word, space_follows)
    """
    with open(os.path.join(output_dir, 'th_%s-ud-%s-mwt.json' % (dataset_name, section)), 'w') as fout:
        fout.write("[]\n")

    text_out = open(os.path.join(output_dir, 'th_%s.%s.txt' % (dataset_name, section)), 'w')
    label_out = open(os.path.join(output_dir, 'th_%s-ud-%s.toklabels' % (dataset_name, section)), 'w')
    for document in documents:
        for paragraph in document:
            for sentence_idx, sentence in enumerate(paragraph):
                for word_idx, word in enumerate(sentence):
                    # TODO: split with newlines to make it more readable?
                    text_out.write(word[0])
                    # labels: 0 for word-internal characters, 1 for the last
                    # character of a word, 2 for the last character of a sentence
                    for i in range(len(word[0]) - 1):
                        label_out.write("0")
                    if word_idx == len(sentence) - 1:
                        label_out.write("2")
                    else:
                        label_out.write("1")
                    if word[1] and (sentence_idx != len(paragraph) - 1 or word_idx != len(sentence) - 1):
                        text_out.write(' ')
                        label_out.write('0')
        text_out.write("\n\n")
        label_out.write("\n\n")
    text_out.close()
    label_out.close()

    with open(os.path.join(output_dir, 'th_%s.%s.gold.conllu' % (dataset_name, section)), 'w') as fout:
        for document in documents:
            for paragraph in document:
                new_par = True
                for sentence in paragraph:
                    for word_idx, word in enumerate(sentence):
                        # the MISC column is '_' if there is a space after the word
                        if word[1] and new_par:
                            space = 'NewPar=Yes'
                        elif word[1]:
                            space = '_'
                        elif new_par:
                            space = 'SpaceAfter=No|NewPar=Yes'
                        else:
                            space = 'SpaceAfter=No'
                        new_par = False
                        # Note the faked dependency structure: the conll reading code
                        # needs it even if it isn't being used in any way
                        fake_dep = 'root' if word_idx == 0 else 'dep'
                        fout.write('{}\t{}\t_\t_\t_\t_\t{}\t{}\t{}:{}\t{}\n'.format(word_idx+1, word[0], word_idx, fake_dep, word_idx, fake_dep, space))
                    fout.write('\n')

def write_dataset(documents, output_dir, dataset_name):
    """
    Shuffle a list of documents, then write train/dev/test sections (80/10/10)
    """
    random.shuffle(documents)
    num_train = int(len(documents) * 0.8)
    num_dev = int(len(documents) * 0.1)
    os.makedirs(output_dir, exist_ok=True)
    write_section(output_dir, dataset_name, 'train', documents[:num_train])
    write_section(output_dir, dataset_name, 'dev', documents[num_train:num_train+num_dev])
    write_section(output_dir, dataset_name, 'test', documents[num_train+num_dev:])

def write_dataset_best(documents, test_documents, output_dir, dataset_name):
    """
    Shuffle a list of documents, then write train/dev sections (85/15).
    The test section comes from the separately provided test_documents.
    """
    random.shuffle(documents)
    num_train = int(len(documents) * 0.85)
    os.makedirs(output_dir, exist_ok=True)
    write_section(output_dir, dataset_name, 'train', documents[:num_train])
    # use the full remainder for dev so no documents are silently dropped
    # by integer truncation of the two split sizes
    write_section(output_dir, dataset_name, 'dev', documents[num_train:])
    write_section(output_dir, dataset_name, 'test', test_documents)
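# A minimal usage sketch (hypothetical data and output path, not from any real
# corpus) of the nesting write_dataset expects:
# document -> paragraph -> sentence -> (word, space_follows)
#
#   documents = [
#       [                                                      # one document
#           [                                                  # one paragraph
#               [("วันนี้", False), ("อากาศ", False), ("ดี", True)],  # one sentence
#               [("ผม", False), ("ไป", False), ("เดินเล่น", False)],
#           ],
#       ],
#   ]
#   write_dataset(documents, "data/tokenize", "orchid")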
def reprocess_lines(processed_lines):
    """
    Reprocesses lines using pythainlp to cut up sentences into shorter sentences.

    Many of the lines in BEST seem to be multiple Thai sentences concatenated,
    according to native Thai speakers.

    Input: a list of lines, where each line is a list of words.
      Space characters can be included as words
    Output: a new list of lines, resplit using pythainlp
    """
    reprocessed_lines = []
    for line in processed_lines:
        text = "".join(line)
        try:
            chunks = sent_tokenize(text)
        except NameError as e:
            raise NameError("Sentences cannot be reprocessed without first installing pythainlp") from e
        # check that the total text coming back is the same as the text going in
        if sum(len(x) for x in chunks) != len(text):
            raise ValueError("Got unexpected text length: \n{}\nvs\n{}".format(text, chunks))
        chunk_lengths = [len(x) for x in chunks]

        current_length = 0
        new_line = []
        for word in line:
            if len(word) + current_length < chunk_lengths[0]:
                # the word fits entirely within the current chunk
                new_line.append(word)
                current_length = current_length + len(word)
            elif len(word) + current_length == chunk_lengths[0]:
                # the word ends exactly at a chunk boundary
                new_line.append(word)
                reprocessed_lines.append(new_line)
                new_line = []
                chunk_lengths = chunk_lengths[1:]
                current_length = 0
            else:
                # the word crosses a chunk boundary: split it, possibly more
                # than once if it spans several chunks
                remaining_len = chunk_lengths[0] - current_length
                new_line.append(word[:remaining_len])
                reprocessed_lines.append(new_line)
                word = word[remaining_len:]
                chunk_lengths = chunk_lengths[1:]
                while len(word) > chunk_lengths[0]:
                    new_line = [word[:chunk_lengths[0]]]
                    reprocessed_lines.append(new_line)
                    word = word[chunk_lengths[0]:]
                    chunk_lengths = chunk_lengths[1:]
                new_line = [word]
                current_length = len(word)
        # don't append an empty line if the last word ended a chunk exactly
        if new_line:
            reprocessed_lines.append(new_line)
    return reprocessed_lines

def convert_processed_lines(processed_lines):
    """
    Convert a list of sentences into documents suitable for the output methods in this module.

    Input: a list of lines, including space words
    Output: a list of documents, each document containing a list of sentences
      Each sentence is a list of words: (text, space_follows)
    Space words will be eliminated.
    """
    paragraphs = []
    sentences = []
    for words in processed_lines:
        # turn the words into a sentence
        if len(words) > 1 and " " == words[0]:
            words = words[1:]
        elif len(words) == 1 and " " == words[0]:
            words = []
        sentence = []
        for word in words:
            word = word.strip()
            if not word:
                # a space word: mark the previous word as having a space after it
                if len(sentence) == 0:
                    raise ValueError("Unexpected space at the start of a sentence")
                sentence[-1] = (sentence[-1][0], True)
            else:
                sentence.append((word, False))
        # blank lines are very rare in best, but why not treat them as a paragraph break
        if len(sentence) == 0:
            paragraphs.append([sentences])
            sentences = []
            continue
        sentence[-1] = (sentence[-1][0], True)
        sentences.append(sentence)
    paragraphs.append([sentences])
    return paragraphs
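# A minimal sketch of the full reprocessing pipeline (hypothetical input and
# output path; the reprocess_lines step requires pythainlp to be installed):
#
#   processed_lines = [["วันนี้", "อากาศ", "ดี", " ", "ผม", "ไป", "เดินเล่น"]]
#   lines = reprocess_lines(processed_lines)    # resplit via pythainlp sent_tokenize
#   documents = convert_processed_lines(lines)  # spaces become space_follows flags
#   write_dataset(documents, "data/tokenize", "best")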