| # Copyright (c) Facebook, Inc. and its affiliates. | |
| # | |
| # This source code is licensed under the MIT license found in the | |
| # LICENSE file in the root directory of this source tree. | |
| import re | |
# Matches any run of one or more whitespace characters.
SPACE_NORMALIZER = re.compile(r"\s+")


def tokenize_line(line):
    """Split a line of text into whitespace-separated tokens.

    Each run of whitespace is collapsed to a single space and the ends are
    trimmed before the line is split. A value that is not a ``str`` is
    returned unchanged.

    Args:
        line: the text to tokenize, or any non-string value.

    Returns:
        A list of token strings for ``str`` input; otherwise ``line`` itself.
    """
    if isinstance(line, str):
        collapsed = SPACE_NORMALIZER.sub(" ", line)
        return collapsed.strip().split()
    # Non-string input is passed straight through.
    return line
def char_tokenizer(line):
    """Split a line into single characters, ending each word with ``'|'``.

    Words are the whitespace-separated pieces of ``line``. The characters of
    each word are emitted in order and every word, including the last, is
    followed by exactly one ``'|'`` boundary marker.

    Args:
        line (str): the text to tokenize.

    Returns:
        list: single-character strings. An empty or all-whitespace line
        yields ``['|']``.
    """
    # split() with no argument drops leading/trailing whitespace and treats
    # any run of whitespace (repeated spaces, tabs, newlines) as a single
    # word boundary, so no empty words ("||") or raw whitespace characters
    # end up in the output.
    words = line.split()
    return list("|".join(words) + "|")