File size: 575 Bytes
5839f86
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
#!/usr/bin/python3
# -*- coding: utf-8 -*-
from typing import Callable, Dict, List

from toolbox.tokenization.pyltp_tokenization import pyltp_tokenize


# Language name -> names of the tokenization engines listed for that language.
# Each engine name here is expected to be a key of ``engine_to_tagger``.
# NOTE(review): nothing in the visible code consults this table; ``tokenize``
# does not check the requested language against it.
language_to_engines: Dict[str, List[str]] = {
    "chinese": ["pyltp"],
}


# Engine name -> callable implementing that engine. ``tokenize`` looks the
# requested engine up here and invokes the callable as ``fn(text, language)``.
engine_to_tagger: Dict[str, Callable] = {"pyltp": pyltp_tokenize}


def tokenize(text: str, language: str, engine: str) -> List[str]:
    """Tokenize *text* with the engine registered under *engine*.

    The engine callable is looked up in ``engine_to_tagger`` and invoked as
    ``fn(text, language)``; whatever it returns is passed back unchanged.

    Args:
        text: The text to tokenize.
        language: Forwarded as-is to the engine callable. It is not checked
            against ``language_to_engines`` here.
        engine: Key into ``engine_to_tagger`` selecting the implementation.

    Returns:
        The engine callable's result (annotated as a list of strings).

    Raises:
        AssertionError: If *engine* has no entry in ``engine_to_tagger``.
    """
    engine_fn = engine_to_tagger.get(engine)
    if engine_fn is not None:
        return engine_fn(text, language)

    raise AssertionError(f"engine {engine} not supported.")


# Script entry point: intentionally a no-op, so running this module directly
# does nothing beyond the imports and definitions above.
if __name__ == "__main__":
    pass