Spaces:
Sleeping
Sleeping
import re
from collections import Counter
from uuid import uuid4

import spacy
| # Load the English model | |
| nlp = spacy.load("en_core_web_md") | |
| REGEX_PATTERNS = { | |
| "email_pattern": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b", | |
| "phone_pattern": r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}", | |
| "link_pattern": r"\b(?:https?://|www\.)\S+\b", | |
| } | |
| def generate_unique_id(): | |
| """ | |
| Generate a unique ID and return it as a string. | |
| Returns: | |
| str: A string with a unique ID. | |
| """ | |
| return str(uuid4()) | |
| class TextCleaner: | |
| """ | |
| A class for cleaning a text by removing specific patterns. | |
| """ | |
| def remove_emails_links(text): | |
| """ | |
| Clean the input text by removing specific patterns. | |
| Args: | |
| text (str): The input text to clean. | |
| Returns: | |
| str: The cleaned text. | |
| """ | |
| for pattern in REGEX_PATTERNS: | |
| text = re.sub(REGEX_PATTERNS[pattern], "", text) | |
| return text | |
| def clean_text(text): | |
| """ | |
| Clean the input text by removing specific patterns. | |
| Args: | |
| text (str): The input text to clean. | |
| Returns: | |
| str: The cleaned text. | |
| """ | |
| text = TextCleaner.remove_emails_links(text) | |
| doc = nlp(text) | |
| for token in doc: | |
| if token.pos_ == "PUNCT": | |
| text = text.replace(token.text, "") | |
| return str(text) | |
| def remove_stopwords(text): | |
| """ | |
| Clean the input text by removing stopwords. | |
| Args: | |
| text (str): The input text to clean. | |
| Returns: | |
| str: The cleaned text. | |
| """ | |
| doc = nlp(text) | |
| for token in doc: | |
| if token.is_stop: | |
| text = text.replace(token.text, "") | |
| return text | |
| class CountFrequency: | |
| def __init__(self, text): | |
| self.text = text | |
| self.doc = nlp(text) | |
| def count_frequency(self): | |
| """ | |
| Count the frequency of words in the input text. | |
| Returns: | |
| dict: A dictionary with the words as keys and the frequency as values. | |
| """ | |
| pos_freq = {} | |
| for token in self.doc: | |
| if token.pos_ in pos_freq: | |
| pos_freq[token.pos_] += 1 | |
| else: | |
| pos_freq[token.pos_] = 1 | |
| return pos_freq | |