Text Classification
Scikit-learn
ONNX
Spam
Spam-Categoriser
File size: 947 Bytes
0ec6609
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import re
import string

def preprocess_text(text: str) -> str:
    """Normalize raw message text into a compact, lowercase token string.

    The steps run in this fixed order (later steps see the output of
    earlier ones):

    1. Lowercase the whole string.
    2. Replace ``http://``/``https://`` links and ``www.`` hosts with the
       placeholder ``" [URL] "``. The brackets are removed again in step 5,
       so the final output contains the bare token ``URL``.
    3. Collapse any run of three or more identical characters (letters,
       digits, punctuation, spaces; not newlines) down to ONE occurrence,
       e.g. ``"freeeeee"`` -> ``"fre"`` and ``"1000"`` -> ``"10"``.
    4. Join sequences of at least three single word characters that are
       separated by single whitespace characters, e.g. ``"f r e e"`` ->
       ``"free"``. Only literal spaces are removed from the match; tabs or
       newlines between the characters are left in place.
    5. Delete every character listed in ``string.punctuation`` (ASCII only).
    6. Collapse all whitespace runs to one space and trim both ends.

    Args:
        text: The raw input text. Any falsy value (``""``, ``None``)
            short-circuits to an empty result.

    Returns:
        The normalized text, or ``""`` when ``text`` is falsy.
    """
    if not text:
        return ""

    def _squeeze_spaces(match):
        # Drop only literal spaces inside a matched "f r e e" style sequence.
        return match.group().replace(' ', '')

    # Ordered (pattern, replacement) passes. URLs must be handled before the
    # repeated-character pass, otherwise "www" would be collapsed to "w" and
    # no longer be recognised as a link.
    regex_passes = (
        (r'https?://\S+|www\.\S+', ' [URL] '),
        (r'(.)\1{2,}', r'\1'),
        (r'\b(\w\s){2,}\w\b', _squeeze_spaces),
    )

    cleaned = text.lower()
    for pattern, replacement in regex_passes:
        cleaned = re.sub(pattern, replacement, cleaned)

    # Strip ASCII punctuation in a single translate pass.
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = cleaned.translate(punctuation_table)

    # split() with no argument discards leading/trailing whitespace and
    # treats every whitespace run as one separator.
    return ' '.join(cleaned.split())