Spaces:
Sleeping
Sleeping
File size: 1,296 Bytes
3c4cf68 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 |
import joblib
import re
import string
from nltk.corpus import stopwords
def load_model(model_path):
    """
    Deserialize a model that was saved with joblib.

    Args:
    - model_path (str): location of the serialized model file

    Returns:
    - model: the object reconstructed by ``joblib.load``

    Note:
    - joblib files are pickle-based, so only load files from a trusted source.
    """
    return joblib.load(model_path)
# English stopwords from the NLTK corpus, materialized once at import time as a
# set so preprocess_text() can do fast membership tests on each word.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be available
# locally when this module is imported - confirm for the deployment environment.
stop_words = set(stopwords.words('english'))
def preprocess_text(text:str):
    """
    Normalize raw text into a lowercase, space-separated string of content words.

    The steps run in this order:
    1. lowercase and collapse every whitespace run to a single space
    2. remove URLs, e-mail addresses, hashtags and mentions
    3. remove punctuation
    4. remove digits
    5. remove non-printable characters
    6. remove English stopwords and re-join the remaining words with one space

    Noise removal (step 2) has to happen before punctuation is stripped: the
    patterns rely on '@', '#' and '.' to recognise e-mails, hashtags, mentions
    and ``www.`` URLs. Stripping punctuation first would leave them in the
    text as glued tokens such as ``mexorg``.

    NOTE(review): if a model was fitted on text cleaned with the previous
    step order, confirm whether it needs to be re-fitted with this output.

    Args:
    - text (str): raw input text

    Returns:
    - str: cleaned text; words separated by single spaces, no leading or
      trailing whitespace (may be an empty string)
    """
    # Lowercase and turn tabs/newlines into plain spaces up front, so that the
    # non-printable filter below does not glue neighbouring words together.
    text = re.sub(r'\s+', ' ', text.lower().strip())

    # Remove noise while its marker characters are still present.
    text = re.sub(r'http\S+|www\.\S+', '', text)  # URLs
    text = re.sub(r'\S+@\S+\.\S+', '', text)  # Emails
    text = re.sub(r'#[A-Za-z0-9_]+', '', text)  # Hashtags
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)  # Mentions

    # Strip punctuation, digits and non-printable characters.
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)  # Numbers
    text = ''.join(ch for ch in text if ch.isprintable())  # Non-printables

    # Drop stopwords last; split()/join() also collapses the gaps left behind
    # by the removals above, so the result has no double or trailing spaces.
    return ' '.join(word for word in text.split() if word not in stop_words)
|