"""This module contains keyword word extraction using Key-Bert model https://github.com/MaartenGr/KeyBERT https://github.com/TimSchopf/KeyphraseVectorizers @Author: Karthick T. Sharma """ from keyphrase_vectorizers import KeyphraseCountVectorizer from keybert import KeyBERT class KeywordExtractor: """Extract keyword from context.""" _instance = None def __new__(cls): if cls._instance is None: cls._instance = super(KeywordExtractor, cls).__new__(cls) cls._instance._init_model() return cls._instance def _init_model(self): """Initialize keyword extraction model only once.""" self.__kw_model = KeyBERT() self.__vectorizer = KeyphraseCountVectorizer() # def __init__(self): # """Initialize keyword extration model (KeyBERT) and keypharse vectorizer # for meaningful keywords. # """ # self.__kw_model = KeyBERT() # self.__vectorizer = KeyphraseCountVectorizer() def __extract_keywords(self, text): """Extract keywords from corpus using KeyBERT. Args: text (str): corpus used to extract keywords. Returns: list[str]: list of keywords extracted from input corpus. """ kwx = self.__kw_model.extract_keywords( text, vectorizer=self.__vectorizer) kw_ls = [] for i in kwx: # 0 -> keyword, 1-> confidence / probability kw_ls.append(i[0]) return kw_ls def filter_keywords(self, original, summarized): """Extract keywords from both summary and original text and only return keywords which are common. Args: original (str): original corpus. summarized (str): summarized corpus. Returns: list(str): list of keywords common for both corpus. """ orig_ls = set(self.__extract_keywords(original)) sum_ls = self.__extract_keywords(summarized) return list(orig_ls.intersection(sum_ls)) def get_keywords(self, original_list, summarized_list): """Return keywords from input corpus Args: original_list (str): list of original corpus. summarized_list (str): list of summarized corpus. Returns: list(list(str)): list of keywords common for both corpus. """ kw_list = [] for orig, sum_ in zip(original_list, summarized_list): kw_list.append(self.filter_keywords(orig, sum_)) return kw_list