Spaces:
Sleeping
Sleeping
import joblib
import re
import string

import nltk

# Make sure the NLTK stopwords corpus is present before it is imported below;
# download it quietly on first run only.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords', quiet=True)

from nltk.corpus import stopwords
def load_model(model_path):
    """Load and return a model persisted with joblib.

    Args:
        model_path (str): Path to the serialized model file.

    Returns:
        The deserialized model object.
    """
    return joblib.load(model_path)
# English stopwords, built once at import time for O(1) membership tests.
stop_words = set(stopwords.words('english'))
def preprocess_text(text: str) -> str:
    """Normalize raw text for downstream modeling.

    Pipeline: lowercase -> remove noise (URLs, emails, hashtags, mentions,
    numbers) -> strip punctuation and non-printables -> collapse whitespace
    -> drop English stopwords.

    Fix: noise removal now runs BEFORE punctuation stripping. The original
    order deleted punctuation first, which destroyed the '.', '@', '#', ':'
    and '/' characters that the URL/email/hashtag/mention patterns depend
    on, so those patterns could no longer match anything.

    Args:
        text (str): Raw input text.

    Returns:
        str: Cleaned, single-space-separated tokens.
    """
    # Step 1: Lowercase
    text = text.lower()
    # Step 2: Remove noise while its marker characters are still intact.
    text = re.sub(r'http\S+|www\.\S+', '', text)   # URLs
    text = re.sub(r'\S+@\S+\.\S+', '', text)       # Emails
    text = re.sub(r'#[A-Za-z0-9_]+', '', text)     # Hashtags
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)     # Mentions
    text = re.sub(r'\d+', '', text)                # Numbers
    # Step 3: Remove punctuation in a single C-level pass.
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Step 4: Drop non-printable characters.
    text = ''.join(ch for ch in text if ch.isprintable())
    # Step 5: Collapse whitespace left behind by the removals above.
    text = re.sub(r'\s+', ' ', text).strip()
    # Step 6: Remove stopwords (module-level set built from NLTK).
    return ' '.join(word for word in text.split() if word not in stop_words)