# JobShield-AI / utils/text_processing.py
# shravanijadhav264's picture
# Initial clean commit
# 984c70c
# Coding by Samitha Randika | https://www.linkedin.com/in/samitha-randika-edirisinghe-b3a68a2b6 #
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
# Fetch the NLTK data this module relies on at import time (quietly, so
# importing the module does not spam progress output).
nltk.download('punkt', quiet=True)
# Recent NLTK releases (3.9+) make word_tokenize load the 'punkt_tab'
# resource instead of the pickled 'punkt' models; without it tokenization
# raises LookupError. 'punkt' is still fetched for older NLTK versions.
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
def clean_text(text):
    """Normalize raw text into lowercase words separated by single spaces.

    The stages run in a fixed order: strip URLs, drop every character that
    is neither a word character nor whitespace, lowercase, drop digit runs,
    drop ASCII punctuation, then collapse whitespace.

    Args:
        text: The value to clean. Anything that is not a ``str`` is treated
            as missing.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    # Anything starting with http / www up to the next whitespace is a URL.
    without_urls = re.sub(
        r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE
    )

    # Keep only word characters and whitespace (underscores survive here).
    words_and_spaces = re.sub(r'[^\w\s]', '', without_urls)

    lowered = words_and_spaces.lower()

    letters_only = re.sub(r'\d+', '', lowered)

    # string.punctuation includes "_", which the \w filter above let through.
    punctuation_table = str.maketrans('', '', string.punctuation)
    unpunctuated = letters_only.translate(punctuation_table)

    # Squash whitespace runs (including newlines/tabs) and trim the ends.
    collapsed = re.sub(r'\s+', ' ', unpunctuated)
    return collapsed.strip()
# Lazily populated cache of English stop words; filled on first use so the
# set is not rebuilt from the corpus reader on every tokenize_text() call.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop words as a set, building it only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def tokenize_text(text):
    """Clean ``text`` and split it into meaningful word tokens.

    The input is first normalized with ``clean_text`` and tokenized with
    NLTK's ``word_tokenize``. English stop words and tokens of two
    characters or fewer are then discarded.

    Args:
        text: The raw text to tokenize. Non-string input is turned into an
            empty string by ``clean_text``.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    stop_words = _english_stop_words()
    return [
        word
        for word in word_tokenize(clean_text(text))
        if len(word) > 2 and word not in stop_words
    ]