resumematcher / scripts /KeytermsExtraction.py
March
first
46917c3
import textacy
from textacy import extract
class KeytermExtractor:
"""
A class for extracting keyterms from a given text using various algorithms.
"""
def __init__(self, raw_text: str, top_n_values: int = 20):
"""
Initialize the KeytermExtractor object.
Args:
raw_text (str): The raw input text.
top_n_values (int): The number of top keyterms to extract.
"""
self.raw_text = raw_text
self.text_doc = textacy.make_spacy_doc(self.raw_text, lang="en_core_web_md")
self.top_n_values = top_n_values
def get_keyterms_based_on_textrank(self):
"""
Extract keyterms using the TextRank algorithm.
Returns:
List[str]: A list of top keyterms based on TextRank.
"""
return list(
extract.keyterms.textrank(
self.text_doc, normalize="lemma", topn=self.top_n_values
)
)
def get_keyterms_based_on_sgrank(self):
"""
Extract keyterms using the SGRank algorithm.
Returns:
List[str]: A list of top keyterms based on SGRank.
"""
return list(
extract.keyterms.sgrank(
self.text_doc, normalize="lemma", topn=self.top_n_values
)
)
def get_keyterms_based_on_scake(self):
"""
Extract keyterms using the sCAKE algorithm.
Returns:
List[str]: A list of top keyterms based on sCAKE.
"""
return list(
extract.keyterms.scake(
self.text_doc, normalize="lemma", topn=self.top_n_values
)
)
def get_keyterms_based_on_yake(self):
"""
Extract keyterms using the YAKE algorithm.
Returns:
List[str]: A list of top keyterms based on YAKE.
"""
return list(
extract.keyterms.yake(
self.text_doc, normalize="lemma", topn=self.top_n_values
)
)
def bi_gramchunker(self):
"""
Chunk the text into bigrams.
Returns:
List[str]: A list of bigrams.
"""
return list(
textacy.extract.basics.ngrams(
self.text_doc,
n=2,
filter_stops=True,
filter_nums=True,
filter_punct=True,
)
)
def tri_gramchunker(self):
"""
Chunk the text into trigrams.
Returns:
List[str]: A list of trigrams.
"""
return list(
textacy.extract.basics.ngrams(
self.text_doc,
n=3,
filter_stops=True,
filter_nums=True,
filter_punct=True,
)
)