# graphgen/utils/help_nltk.py
import os
import warnings
from functools import lru_cache
from typing import Dict, Final, List, Optional

import jieba
import nltk

# Suppress the UserWarning that jieba's compatibility shim emits on import.
warnings.filterwarnings(
    "ignore",
    category=UserWarning,
    module=r"jieba\._compat",
)


class NLTKHelper:
    """Helper around NLTK and jieba for stopword lookup and word tokenization."""

    SUPPORTED_LANGUAGES: Final[Dict[str, str]] = {
        "en": "english",
        "zh": "chinese",
    }

    # Maps each required NLTK package to the data category it lives under,
    # so nltk.data.find() can be queried as "<category>/<package>".
    _NLTK_PACKAGES: Final[Dict[str, str]] = {
        "stopwords": "corpora",
        "punkt_tab": "tokenizers",
    }

    def __init__(self, nltk_data_path: Optional[str] = None):
        # Default to <package root>/resources/nltk_data when no path is given.
        self._nltk_path = nltk_data_path or os.path.join(
            os.path.dirname(os.path.dirname(__file__)),
            "resources",
            "nltk_data",
        )
        nltk.data.path.append(self._nltk_path)
        # Build jieba's dictionary eagerly so the first tokenization call is fast.
        jieba.initialize()
        self._ensure_nltk_data("stopwords")
        self._ensure_nltk_data("punkt_tab")

    def _ensure_nltk_data(self, package_name: str) -> None:
        """Download an NLTK data package into the local path if it is missing."""
        try:
            nltk.data.find(f"{self._NLTK_PACKAGES[package_name]}/{package_name}")
        except LookupError:
            nltk.download(package_name, download_dir=self._nltk_path, quiet=True)

    # NOTE: lru_cache on a method also keys on ``self``; with a single shared
    # helper instance, maxsize=2 covers both supported languages.
    @lru_cache(maxsize=2)
    def get_stopwords(self, lang: str) -> List[str]:
        """Return the stopword list for a supported language code."""
        if lang not in self.SUPPORTED_LANGUAGES:
            raise ValueError(f"Language {lang} is not supported.")
        return nltk.corpus.stopwords.words(self.SUPPORTED_LANGUAGES[lang])

    def word_tokenize(self, text: str, lang: str) -> List[str]:
        """Tokenize ``text`` with jieba for Chinese and NLTK for English."""
        if lang not in self.SUPPORTED_LANGUAGES:
            raise ValueError(f"Language {lang} is not supported.")
        if lang == "zh":
            # jieba.lcut returns a list of segmented tokens directly.
            return jieba.lcut(text)
        return nltk.word_tokenize(text)
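
# A minimal usage sketch, not part of the original module: it assumes the
# required NLTK packages either ship in resources/nltk_data or can be
# downloaded on first run, and that network access is available if not.
if __name__ == "__main__":
    helper = NLTKHelper()
    print(helper.word_tokenize("Knowledge graphs power synthetic data.", "en"))
    print(helper.word_tokenize("知识图谱驱动数据合成。", "zh"))
    print(len(helper.get_stopwords("en")), "English stopwords loaded")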