File size: 538 Bytes
59b7eeb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import re


# Pre-compiled pattern matching any run of one or more whitespace characters.
SPACE_NORMALIZER = re.compile(r"\s+")


def tokenize_line(line):
    """Split a line of text into whitespace-delimited tokens.

    Runs of whitespace are collapsed to a single space and surrounding
    whitespace is trimmed before the line is split.

    Args:
        line: The text to tokenize. Any value that is not a ``str`` is
            returned unchanged.

    Returns:
        A list of token strings for ``str`` input; otherwise ``line`` itself.
    """
    if isinstance(line, str):
        collapsed = SPACE_NORMALIZER.sub(" ", line).strip()
        return collapsed.split()
    # Non-string input is handed back untouched.
    return line

def char_tokenizer(line):
    """Split a line into single characters, using ``'|'`` as the word boundary.

    Leading and trailing whitespace is stripped, every remaining space
    character is replaced by ``'|'``, and one extra ``'|'`` is appended to
    the end. Each space maps to its own ``'|'``, so consecutive spaces give
    consecutive ``'|'`` entries.

    Args:
        line: The text to tokenize; must be a ``str``.

    Returns:
        A list of one-character strings. An empty or whitespace-only line
        yields ``['|']``.
    """
    marked = line.strip().replace(" ", "|") + "|"
    # list() on a string yields its characters in order.
    return list(marked)