# gen-question/src/services/AI/keyword_extractor.py
# linhnguyen02
# init commit
# d3530f3
"""This module contains keyword extraction using the KeyBERT model
https://github.com/MaartenGr/KeyBERT
https://github.com/TimSchopf/KeyphraseVectorizers
@Author: Karthick T. Sharma
"""
from keyphrase_vectorizers import KeyphraseCountVectorizer
from keybert import KeyBERT
class KeywordExtractor:
    """Extract keywords from text with KeyBERT and a keyphrase vectorizer.

    The class is a singleton: the first ``KeywordExtractor()`` call builds
    the KeyBERT model and the ``KeyphraseCountVectorizer``; every later
    call returns that same instance without rebuilding them.
    """

    _instance = None

    def __new__(cls):
        """Return the shared instance, creating and initializing it once."""
        if cls._instance is None:
            instance = super().__new__(cls)
            # Build the models BEFORE publishing the instance. If model
            # construction raises, nothing is cached and the next call
            # retries instead of handing out a half-initialized object.
            instance._init_model()
            cls._instance = instance
        return cls._instance

    def _init_model(self):
        """Initialize keyword extraction model only once."""
        self.__kw_model = KeyBERT()
        self.__vectorizer = KeyphraseCountVectorizer()

    def __extract_keywords(self, text):
        """Extract keywords from corpus using KeyBERT.

        Args:
            text (str): corpus used to extract keywords.

        Returns:
            list[str]: keywords extracted from the input corpus, in the
                order the model returned them.
        """
        scored = self.__kw_model.extract_keywords(
            text, vectorizer=self.__vectorizer)
        # Each entry is indexable: 0 -> keyword, 1 -> confidence / probability.
        return [entry[0] for entry in scored]

    def filter_keywords(self, original, summarized):
        """Return the keywords found in both the original and the summary.

        Args:
            original (str): original corpus.
            summarized (str): summarized corpus.

        Returns:
            list[str]: unique keywords common to both corpora, in the order
                they were extracted from the summary.
        """
        original_keywords = set(self.__extract_keywords(original))
        summary_keywords = self.__extract_keywords(summarized)
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # result is deterministic (a plain set intersection is not ordered).
        return [
            keyword
            for keyword in dict.fromkeys(summary_keywords)
            if keyword in original_keywords
        ]

    def get_keywords(self, original_list, summarized_list):
        """Return common keywords for each (original, summary) pair.

        The two inputs are paired positionally with ``zip``, so pairing
        stops at the end of the shorter list.

        Args:
            original_list (list[str]): list of original corpora.
            summarized_list (list[str]): list of summarized corpora.

        Returns:
            list[list[str]]: one list of common keywords per input pair.
        """
        return [
            self.filter_keywords(original, summarized)
            for original, summarized in zip(original_list, summarized_list)
        ]