part_of_speech / toolbox /tokenization /pyltp_tokenization.py
HoneyTian's picture
update
5839f86
raw
history blame contribute delete
579 Bytes
#!/usr/bin/python3
# -*- coding: utf-8 -*-
from functools import lru_cache
import os
from typing import List
# Path to the directory holding the LTP model files, read from the
# LTP_DATA_DIR environment variable (None if the variable is unset).
ltp_data_dir = os.environ.get("LTP_DATA_DIR")
from pyltp import Segmentor
@lru_cache(maxsize=5)
def get_pyltp_tokenizer():
    """Build and cache a pyltp Segmentor loaded with the `cws.model` file.

    Returns:
        Segmentor: a word segmentor ready for `.segment(text)` calls.

    Raises:
        EnvironmentError: if the LTP_DATA_DIR environment variable is not set.
    """
    # Fail fast with a clear message; otherwise os.path.join(None, ...)
    # raises an opaque TypeError when LTP_DATA_DIR is unset.
    if ltp_data_dir is None:
        raise EnvironmentError("environment variable `LTP_DATA_DIR` is not set.")
    cws_model_path = os.path.join(ltp_data_dir, "cws.model")
    # NOTE(review): Segmentor(path) is the pyltp >= 0.4 constructor API;
    # older pyltp required Segmentor() followed by .load(path) — confirm
    # the pinned pyltp version matches.
    segmentor = Segmentor(cws_model_path)
    return segmentor
def pyltp_tokenize(text: str, language: str) -> List[str]:
    """Tokenize `text` into words with the cached pyltp segmentor.

    Args:
        text: the text to segment.
        language: unused here; kept for signature compatibility with the
            other tokenizers in this package.

    Returns:
        List[str]: the segmented words.
    """
    segmentor = get_pyltp_tokenizer()
    words = segmentor.segment(text)
    # segment() returns a pyltp vector-like sequence, not a list; convert
    # so the declared List[str] return type actually holds for callers.
    return list(words)
if __name__ == "__main__":
    # No standalone demo; this module is intended to be imported.
    pass