Spaces:

shravanijadhav264
/

JobShield-AI

Sleeping

App Files Files Community

JobShield-AI / scripts /utils /text_processing.py

shravanijadhav264

Initial clean commit

984c70c 17 days ago

raw

history blame contribute delete

1.87 kB

	import re
	import nltk
	from nltk.corpus import stopwords
	from nltk.tokenize import word_tokenize
	import string

	# Fetch the NLTK data that word_tokenize() and stopwords.words() read.
	# Newer NLTK releases (3.9 and later) load the 'punkt_tab' resource for
	# word_tokenize, while older ones load 'punkt'; request both so the module
	# works regardless of which NLTK version is installed. quiet=True suppresses
	# the downloader's console output.
	for _resource in ('punkt', 'punkt_tab', 'stopwords'):
	    nltk.download(_resource, quiet=True)

	def clean_text(text, remove_numbers=True):
	    """Normalize raw text into lowercase words separated by single spaces.

	    Processing order: strip URLs, drop non-word characters, lowercase,
	    optionally drop digits, drop underscores/ASCII punctuation, collapse
	    whitespace.

	    Args:
	        text: Value to clean. Anything that is not a ``str`` yields ``""``.
	        remove_numbers: When True (the default, matching the historical
	            behaviour) every run of digits is deleted. Pass False to keep
	            numeric content such as salaries or years.

	    Returns:
	        The cleaned string, with no leading or trailing whitespace.
	    """
	    if not isinstance(text, str):
	        return ""

	    # Remove URLs. The alternation bars must be unescaped: r'\|' matches a
	    # literal pipe character, which made the previous pattern a no-op on
	    # real links. IGNORECASE is needed because lowercasing happens later.
	    text = re.sub(r'https?://\S+|www\.\S+', '', text, flags=re.IGNORECASE)
	    # Remove special characters (anything that is neither word nor space).
	    text = re.sub(r'[^\w\s]', '', text)
	    # Convert to lowercase.
	    text = text.lower()
	    # Remove numbers unless the caller asked to keep them.
	    if remove_numbers:
	        text = re.sub(r'\d+', '', text)
	    # \w counts '_' as a word character, so the regex above leaves
	    # underscores behind; this pass removes them with the rest of the
	    # ASCII punctuation.
	    text = text.translate(str.maketrans('', '', string.punctuation))
	    # Collapse runs of whitespace into single spaces.
	    return re.sub(r'\s+', ' ', text).strip()


	# Multi-word job titles that are joined with underscores before tokenizing
	# so each title survives as a single token.
	_JOB_TITLE_TOKENS = {
	    "system administrator": "system_administrator",
	    "database administrator": "database_administrator",
	    "web developer": "web_developer",
	    "security analyst": "security_analyst",
	    "data scientist": "data_scientist",
	    "devops engineer": "devops_engineer",
	    "cloud engineer": "cloud_engineer",
	    "machine learning engineer": "machine_learning_engineer",
	    "software engineer": "software_engineer",
	}

	# Lazily populated cache of the English stop-word list.
	_STOP_WORDS = None


	def _english_stop_words():
	    """Return the NLTK English stop words as a set, loading them only once."""
	    global _STOP_WORDS
	    if _STOP_WORDS is None:
	        _STOP_WORDS = frozenset(stopwords.words('english'))
	    return _STOP_WORDS


	def tokenize_text(text, keep_numbers=False):
	    """Clean *text* and split it into filtered word tokens.

	    The text is normalized with ``clean_text``, known multi-word job titles
	    are merged into single underscore-joined tokens, and the result is
	    tokenized with NLTK's ``word_tokenize``.

	    Args:
	        text: Raw input; non-string values produce an empty list.
	        keep_numbers: When True, digits survive cleaning and purely numeric
	            tokens are kept. Single-character tokens are always dropped, so
	            a lone digit such as ``"5"`` is still filtered out.

	    Returns:
	        A list of tokens with English stop words, tokens shorter than two
	        characters and (unless ``keep_numbers``) numeric tokens removed.
	    """
	    # Digits have to survive cleaning for keep_numbers to have any effect.
	    text = clean_text(text, remove_numbers=not keep_numbers)

	    # Handle special cases for job titles.
	    for phrase, replacement in _JOB_TITLE_TOKENS.items():
	        text = text.replace(phrase, replacement)

	    stop_words = _english_stop_words()
	    return [
	        token
	        for token in word_tokenize(text)
	        if token not in stop_words
	        and len(token) >= 2
	        and (keep_numbers or not token.isdigit())
	    ]