File size: 1,286 Bytes
097428b
 
 
 
 
c296037
097428b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54

import joblib

import re
import string
import nltk
from nltk.corpus import stopwords


# Fetch the NLTK 'stopwords' corpus when this module is executed;
# stopwords.words('english') further down reads from that corpus.
nltk.download('stopwords')


def load_model(model_path):
    """
    Read a joblib-serialized model back from disk.

    Args:
    - model_path (str): location of the joblib file to read

    Returns:
    - model: whatever object joblib.load produces for that file

    NOTE(review): joblib files are pickle-based according to the joblib
    documentation, so this should only be pointed at trusted files.
    """
    return joblib.load(model_path)



# English stopwords from the NLTK corpus, materialized as a set so the
# per-word `in stop_words` check in preprocess_text is a hash lookup.
stop_words = set(stopwords.words('english'))

def preprocess_text(text: str) -> str:
    """
    Normalize raw text into a space-separated string of cleaned tokens.

    Pipeline, in order:
      1. Lowercase.
      2. Remove URLs, e-mail addresses, hashtags, mentions and digit runs.
      3. Drop stopwords (module-level ``stop_words``).
      4. Strip punctuation and non-printable characters from what is left.
      5. Join the surviving tokens with single spaces.

    Noise removal runs before punctuation stripping on purpose: the patterns
    are anchored on '@', '#' and '.', so stripping punctuation first would
    leave them nothing to match and the e-mail/hashtag/mention text would
    leak into the output.

    Args:
    - text (str): raw input text

    Returns:
    - str: cleaned text with no leading, trailing or repeated spaces
    """
    text = text.lower()

    # Order matters inside this group: e-mails must go before mentions,
    # otherwise the mention pattern would eat the "@domain" half of an address.
    text = re.sub(r'http\S+|www\.\S+', '', text)       # URLs
    text = re.sub(r'\S+@\S+\.\S+', '', text)           # Emails
    text = re.sub(r'#[A-Za-z0-9_]+', '', text)         # Hashtags
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)         # Mentions
    text = re.sub(r'\d+', '', text)                    # Numbers

    punctuation_table = str.maketrans('', '', string.punctuation)
    kept = []
    # str.split() with no argument splits on any whitespace run, so this also
    # collapses the gaps left behind by the removals above.
    for token in text.split():
        # Compare with only the edge punctuation trimmed so that "the," and
        # apostrophe-bearing entries (e.g. "don't", if the list has them)
        # are recognized before the apostrophe is stripped.
        if token.strip(string.punctuation) in stop_words:
            continue
        cleaned = token.translate(punctuation_table)
        cleaned = ''.join(ch for ch in cleaned if ch.isprintable())
        # Re-check after cleaning: a token may only become a stopword (or
        # become empty) once inner punctuation is gone.
        if cleaned and cleaned not in stop_words:
            kept.append(cleaned)

    return ' '.join(kept)