Spaces:
Runtime error
Runtime error
| import re | |
| from nltk import sent_tokenize, word_tokenize | |
| from nltk.tokenize.treebank import TreebankWordDetokenizer | |
| detokenizer = TreebankWordDetokenizer() | |
def handle_dounble_quote(sent):
    """Normalize the spacing around double quotes in ``sent``.

    Double quotes are treated as strictly alternating: the first one opens,
    the next one closes, and so on.  An opening quote ends up with a single
    space before it and no spaces after it; a closing quote ends up with no
    whitespace before it and a single space after it.  Finally the text is
    stripped and runs of spaces are collapsed to one.

    Args:
        sent: The text to clean up.

    Returns:
        The re-spaced string.  Unbalanced quotes are not reported; the
        function always returns a string.
    """
    out = ''
    opening_next = True   # the next '"' seen is an opening quote
    skip_spaces = False   # drop spaces that directly follow an opening quote
    for ch in sent:
        if ch != '"':
            if skip_spaces and ch == ' ':
                continue
            out += ch
            skip_spaces = False
            continue

        # Attach the quote to the correct side, trimming whatever
        # whitespace currently trails the accumulated text.
        if opening_next:
            out = out.rstrip() + ' "'
        else:
            out = out.rstrip() + '" '
        # Only an opening quote swallows the spaces that follow it.
        skip_spaces = opening_next
        opening_next = not opening_next

    return re.sub(r'[ ]+', ' ', out.strip())
def postprocess_space(sent):
    """Remove stray spaces around punctuation, parentheses and clitics.

    The substitutions are applied in order:
      * spaces before ``.``, ``,``, ``!`` and ``?`` are removed;
      * spaces just inside ``(`` and ``)`` are removed;
      * a detached ``'s`` or ``n 't`` is re-attached to the preceding word
        when it is followed by a space, one of ``. , ! ?``, or the end of
        the text.

    Args:
        sent: The text to tidy.

    Returns:
        The text with the spacing fixes applied.
    """
    rules = (
        (r'[ ]+\.', '.'),
        (r'[ ]+,', ','),
        (r'[ ]+!', '!'),
        (r'[ ]+\?', '?'),
        (r'\([ ]+', '('),
        (r'[ ]+\)', ')'),
        # The trailing "$" alternative also re-attaches a clitic that is the
        # last thing in the text (e.g. "John 's"), which has no following
        # space or punctuation mark to match on.
        (r' \'s( |\.|,|!|\?|$)', r"'s\1"),
        (r'n \'t( |\.|,|!|\?|$)', r"n't\1"),
    )
    for pattern, replacement in rules:
        sent = re.sub(pattern, replacement, sent)
    return sent
def detokenize_sent(sent):
    """Rebuild naturally spaced text from a whitespace-tokenized string.

    Steps:
      1. Rejoin a split possessive (``"' s "`` becomes ``"'s "``).
      2. If the number of stand-alone ``'`` tokens is even, rewrite each of
         them as ``"`` so they are handled as paired quotation marks.
      3. Split the text with ``sent_tokenize``, detokenize each piece with
         the module-level ``detokenizer`` and fix the spacing around double
         quotes with ``handle_dounble_quote``.
      4. Join the pieces with single spaces and run ``postprocess_space``.

    Args:
        sent: Text whose tokens are separated by whitespace.

    Returns:
        The detokenized string.
    """
    # Clean raw sent
    sent = re.sub(r'\' s ', '\'s ', sent)
    toks = sent.split()
    # An odd number of lone apostrophes cannot be paired up, so in that
    # case they are left exactly as they are.
    if toks.count("'") % 2 == 0:
        toks = ['"' if t == "'" else t for t in toks]
    sent = ' '.join(toks)

    # handle_dounble_quote always returns a string (it has no error return),
    # so its result is used directly; unbalanced quotes are tolerated.
    final_sents = [
        handle_dounble_quote(detokenizer.detokenize(piece.split()))
        for piece in sent_tokenize(sent)
    ]
    return postprocess_space(' '.join(final_sents))