#!/usr/bin/python3
# -*- coding: utf-8 -*-
from functools import lru_cache
import os
from typing import List

from pyltp import Segmentor

# Root directory of the LTP model files; must point at a directory
# that contains "cws.model".
ltp_data_dir = os.environ.get("LTP_DATA_DIR")


@lru_cache(maxsize=1)  # the function takes no arguments, so one cache slot suffices
def get_pyltp_tokenizer() -> Segmentor:
    """Lazily build and cache a single pyltp word segmentor."""
    if ltp_data_dir is None:
        raise EnvironmentError("LTP_DATA_DIR environment variable is not set")
    cws_model_path = os.path.join(ltp_data_dir, "cws.model")
    # pyltp >= 0.4 loads the model through the constructor; older
    # releases used Segmentor() followed by segmentor.load(path).
    return Segmentor(cws_model_path)


def pyltp_tokenize(text: str, language: str) -> List[str]:
    """Segment Chinese text into words.

    `language` is unused here but kept so the signature matches the
    other tokenizers' interface.
    """
    segmentor = get_pyltp_tokenizer()
    # segment() returns a native string vector; convert it to a plain
    # list so the declared List[str] return type actually holds.
    return list(segmentor.segment(text))


if __name__ == "__main__":
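    # Minimal usage sketch. Assumes LTP_DATA_DIR is set in the
    # environment; the sample sentence is illustrative (taken from the
    # pyltp documentation), not from the original source.
    sample = "元芳你怎么看"
    print(pyltp_tokenize(sample, language="zh"))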