DeepActionPotential committed on
Commit
097428b
·
verified ·
1 Parent(s): 9da9ecd

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +52 -50
utils.py CHANGED
@@ -1,50 +1,52 @@
1
-
2
- import joblib
3
-
4
- import re
5
- import string
6
- from nltk.corpus import stopwords
7
-
8
-
9
-
10
- def load_model(model_path):
11
- """
12
- Load a joblib model
13
-
14
- Args:
15
- - model_path (str): path to the model
16
-
17
- Returns:
18
- - model: loaded model
19
- """
20
- model = joblib.load(model_path)
21
- return model
22
-
23
-
24
-
25
- # Set of English stopwords
26
- stop_words = set(stopwords.words('english'))
27
-
28
- def preprocess_text(text:str):
29
- # Step 1: Lowercase
30
- text = text.lower()
31
-
32
- # Step 2: Strip extra whitespace
33
- text = re.sub(r'\s+', ' ', text.strip())
34
-
35
- # Step 3: Remove punctuation
36
- text = text.translate(str.maketrans('', '', string.punctuation))
37
-
38
- # Step 4: Remove stopwords
39
- text = ' '.join(word for word in text.split() if word not in stop_words)
40
-
41
- # Step 5: Remove noise (URLs, emails, hashtags, mentions, numbers, non-printables)
42
- text = re.sub(r'http\S+|www\.\S+', '', text) # URLs
43
- text = re.sub(r'\S+@\S+\.\S+', '', text) # Emails
44
- text = re.sub(r'#[A-Za-z0-9_]+', '', text) # Hashtags
45
- text = re.sub(r'@[A-Za-z0-9_]+', '', text) # Mentions
46
- text = re.sub(r'\d+', '', text) # Numbers
47
- text = ''.join(ch for ch in text if ch.isprintable()) # Non-printables
48
-
49
- return text
50
-
 
 
 
1
+
2
+ import joblib
3
+
4
+ import re
5
+ import string
6
+ from nltk.corpus import stopwords
7
+
8
+
9
import nltk

# `from nltk.corpus import stopwords` does not bind the name `nltk`, so it is
# imported explicitly here before use.
# Fetch the stopwords corpus only when it is not already installed, so that
# importing this module does not hit the network on every start-up.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
10
+
11
+
12
def load_model(model_path):
    """
    Deserialize a model that was saved with joblib.

    Args:
    - model_path (str): location of the serialized model file

    Returns:
    - model: the object produced by ``joblib.load`` for that file
    """
    # NOTE: joblib.load unpickles the file; only load files from trusted sources.
    return joblib.load(model_path)
24
+
25
+
26
+
27
# English stopwords, held in a set for constant-time membership tests
stop_words = set(stopwords.words('english'))

# Translation table that deletes every ASCII punctuation character
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Noise patterns, applied in this order. Emails are handled before mentions,
# otherwise the mention pattern would consume the "@domain" part of an address.
_NOISE_PATTERNS = (
    re.compile(r'http\S+|www\.\S+'),  # URLs
    re.compile(r'\S+@\S+\.\S+'),      # Emails
    re.compile(r'#[A-Za-z0-9_]+'),    # Hashtags
    re.compile(r'@[A-Za-z0-9_]+'),    # Mentions
    re.compile(r'\d+'),               # Numbers
)


def preprocess_text(text: str) -> str:
    """
    Normalize raw text: lowercase it, drop noise, punctuation and stopwords.

    Noise (URLs, emails, hashtags, mentions, numbers, non-printable
    characters) is removed *before* punctuation is stripped. The noise
    patterns rely on characters such as ``#``, ``@``, ``.`` and ``:``; if
    punctuation were stripped first, those markers would already be gone and
    the email, hashtag, mention and ``www.`` patterns could never match.

    NOTE(review): compared with the previous ordering, this changes the
    cleaned text for inputs containing hashtags, mentions, emails or
    ``www.`` URLs. If a model was fitted on text cleaned with the old
    ordering, confirm parity with the training pipeline.

    Args:
    - text (str): raw input text

    Returns:
    - str: cleaned text, tokens separated by single spaces
    """
    # Step 1: Lowercase
    text = text.lower()

    # Step 2: Collapse runs of whitespace (including newlines/tabs) to one space
    text = re.sub(r'\s+', ' ', text.strip())

    # Step 3: Remove noise while its marker characters are still present
    for pattern in _NOISE_PATTERNS:
        text = pattern.sub('', text)
    text = ''.join(ch for ch in text if ch.isprintable())  # Non-printables

    # Step 4: Remove punctuation
    text = text.translate(_PUNCTUATION_TABLE)

    # Step 5: Remove stopwords; split/join also tidies the gaps left by
    # the removals above, so the result has no leading, trailing or
    # doubled spaces.
    return ' '.join(word for word in text.split() if word not in stop_words)
52
+