Spaces:

marchji2415
/

resumematcher

Sleeping

resumematcher / scripts /KeytermsExtraction.py

March

first

46917c3 9 months ago

2.82 kB

	import textacy
	from textacy import extract


	class KeytermExtractor:
	    """
	    Extract keyterms and n-grams from a text using textacy.

	    The raw text is parsed once into a spaCy document in ``__init__``;
	    every extraction method then operates on that shared document.
	    """

	    def __init__(self, raw_text: str, top_n_values: int = 20) -> None:
	        """
	        Initialize the KeytermExtractor object.

	        Args:
	            raw_text (str): The raw input text.
	            top_n_values (int): Passed as ``topn`` to each keyterm
	                algorithm, i.e. how many top-ranked terms to request.
	        """
	        self.raw_text = raw_text
	        # Parse once up front; "en_core_web_md" is the spaCy pipeline name
	        # handed to textacy, so that model has to be available to spaCy.
	        self.text_doc = textacy.make_spacy_doc(self.raw_text, lang="en_core_web_md")
	        self.top_n_values = top_n_values

	    def _rank_keyterms(self, algorithm):
	        """Run one textacy keyterm algorithm with the settings shared by all of them."""
	        return list(
	            algorithm(self.text_doc, normalize="lemma", topn=self.top_n_values)
	        )

	    def _chunk_ngrams(self, n: int):
	        """Collect the document's ``n``-grams with stop/number/punctuation filtering on."""
	        return list(
	            extract.basics.ngrams(
	                self.text_doc,
	                n=n,
	                filter_stops=True,
	                filter_nums=True,
	                filter_punct=True,
	            )
	        )

	    def get_keyterms_based_on_textrank(self):
	        """
	        Extract keyterms using the TextRank algorithm.

	        Returns:
	            list: The result of ``extract.keyterms.textrank`` on the parsed
	            document, with lemma normalization. Per the textacy
	            documentation each item is a ``(term, score)`` tuple, not a
	            bare string.
	        """
	        return self._rank_keyterms(extract.keyterms.textrank)

	    def get_keyterms_based_on_sgrank(self):
	        """
	        Extract keyterms using the SGRank algorithm.

	        Returns:
	            list: The result of ``extract.keyterms.sgrank`` on the parsed
	            document, with lemma normalization. Per the textacy
	            documentation each item is a ``(term, score)`` tuple, not a
	            bare string.
	        """
	        return self._rank_keyterms(extract.keyterms.sgrank)

	    def get_keyterms_based_on_scake(self):
	        """
	        Extract keyterms using the sCAKE algorithm.

	        Returns:
	            list: The result of ``extract.keyterms.scake`` on the parsed
	            document, with lemma normalization. Per the textacy
	            documentation each item is a ``(term, score)`` tuple, not a
	            bare string.
	        """
	        return self._rank_keyterms(extract.keyterms.scake)

	    def get_keyterms_based_on_yake(self):
	        """
	        Extract keyterms using the YAKE algorithm.

	        Returns:
	            list: The result of ``extract.keyterms.yake`` on the parsed
	            document, with lemma normalization. Per the textacy
	            documentation each item is a ``(term, score)`` tuple, not a
	            bare string.
	        """
	        return self._rank_keyterms(extract.keyterms.yake)

	    def bi_gramchunker(self):
	        """
	        Chunk the text into bigrams.

	        Returns:
	            list: Every 2-token n-gram that ``extract.basics.ngrams``
	            yields with the stop-word, number and punctuation filters
	            enabled. Per the textacy documentation the items are spaCy
	            ``Span`` objects; use ``.text`` on each to get a string.
	        """
	        return self._chunk_ngrams(2)

	    def tri_gramchunker(self):
	        """
	        Chunk the text into trigrams.

	        Returns:
	            list: Every 3-token n-gram that ``extract.basics.ngrams``
	            yields with the stop-word, number and punctuation filters
	            enabled. Per the textacy documentation the items are spaCy
	            ``Span`` objects; use ``.text`` on each to get a string.
	        """
	        return self._chunk_ngrams(3)