Spaces:
Running
Running
| from functools import lru_cache | |
| import os | |
| from typing import Dict, List, Final, Optional | |
| import warnings | |
| import nltk | |
| import jieba | |
| warnings.filterwarnings( | |
| "ignore", | |
| category=UserWarning, | |
| module=r"jieba\._compat" | |
| ) | |
class NLTKHelper:
    """Utility wrapper around NLTK (plus jieba for Chinese) text processing.

    On construction it registers a local NLTK data directory, initializes
    jieba, and ensures the required NLTK packages (stopwords, punkt_tab)
    are available, downloading them into that directory if missing.
    """

    # ISO 639-1 code -> language name understood by the NLTK stopwords corpus.
    SUPPORTED_LANGUAGES: Final[Dict[str, str]] = {
        "en": "english",
        "zh": "chinese",
    }

    # NLTK package name -> category directory used by nltk.data.find().
    _NLTK_PACKAGES: Final[Dict[str, str]] = {
        "stopwords": "corpora",
        "punkt_tab": "tokenizers",
    }

    def __init__(self, nltk_data_path: Optional[str] = None):
        """Initialize the helper.

        Args:
            nltk_data_path: Directory holding NLTK data. Defaults to
                ``<package root>/resources/nltk_data`` relative to this file.
        """
        self._nltk_path = nltk_data_path or os.path.join(
            os.path.dirname(os.path.dirname(__file__)),
            "resources",
            "nltk_data",
        )
        # Guard against growing the global search path when several
        # helpers are created in the same process.
        if self._nltk_path not in nltk.data.path:
            nltk.data.path.append(self._nltk_path)
        jieba.initialize()
        self._ensure_nltk_data("stopwords")
        self._ensure_nltk_data("punkt_tab")

    def _ensure_nltk_data(self, package_name: str) -> None:
        """Download *package_name* into ``self._nltk_path`` if not found.

        Args:
            package_name: Key of ``_NLTK_PACKAGES`` (e.g. ``"stopwords"``).
        """
        try:
            nltk.data.find(f"{self._NLTK_PACKAGES[package_name]}/{package_name}")
        except LookupError:
            nltk.download(package_name, download_dir=self._nltk_path, quiet=True)

    def _validate_lang(self, lang: str) -> None:
        """Raise ValueError when *lang* is not a supported language code."""
        if lang not in self.SUPPORTED_LANGUAGES:
            raise ValueError(f"Language {lang} is not supported.")

    def get_stopwords(self, lang: str) -> List[str]:
        """Return the stopword list for *lang*.

        Args:
            lang: Language code, one of ``SUPPORTED_LANGUAGES`` keys.

        Raises:
            ValueError: If *lang* is not supported.
        """
        self._validate_lang(lang)
        return nltk.corpus.stopwords.words(self.SUPPORTED_LANGUAGES[lang])

    def word_tokenize(self, text: str, lang: str) -> List[str]:
        """Tokenize *text*: jieba for Chinese, NLTK punkt for English.

        Args:
            text: Raw input string.
            lang: Language code, one of ``SUPPORTED_LANGUAGES`` keys.

        Raises:
            ValueError: If *lang* is not supported.
        """
        self._validate_lang(lang)
        if lang == "zh":
            return jieba.lcut(text)
        return nltk.word_tokenize(text)