File size: 538 Bytes
59b7eeb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 |
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import re
# Pre-compiled pattern matching any run of one or more whitespace characters.
SPACE_NORMALIZER = re.compile(r"\s+")
def tokenize_line(line):
    """Split a line of text into whitespace-delimited tokens.

    Every run of whitespace is first collapsed to a single space, the
    result is stripped of leading/trailing whitespace and then split.

    Args:
        line: The text to tokenize. Values that are not ``str`` instances
            are returned unchanged.

    Returns:
        A list of token strings for ``str`` input (empty when the line holds
        only whitespace); otherwise the original ``line`` object.
    """
    if isinstance(line, str):
        collapsed = SPACE_NORMALIZER.sub(" ", line)
        return collapsed.strip().split()
    # Anything that is not a string is handed back untouched.
    return line
def char_tokenizer(line):
    """Split a line into single characters, marking word ends with ``|``.

    Leading and trailing whitespace is stripped, every remaining space
    character is replaced by ``|``, and one extra ``|`` is appended to the
    end before the text is broken into characters.

    Args:
        line: The string to tokenize.

    Returns:
        A list of one-character strings. An empty or whitespace-only line
        yields ``['|']``.
    """
    stripped = line.strip()
    # Each space becomes a separator; consecutive spaces give consecutive '|'.
    marked = stripped.replace(' ', '|') + '|'
    return list(marked)
|