File size: 525 Bytes
b170a77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
import re

def preprocess(text: str) -> str:
    """
    Clean complaint text before feeding to HateBERT.

    Preserves casing — HateBERT is case-aware for offensive detection.

    The steps run in a fixed order: strip URLs, strip @mentions, delete
    every character that is not a word character, whitespace or one of
    ``! ? . ,``, then collapse whitespace runs and trim both ends.

    Args:
        text: Raw complaint text.

    Returns:
        The cleaned text. May be an empty string when nothing survives
        the filters.
    """
    # Remove URLs. Schemes and host names are case-insensitive, so
    # "HTTPS://..." and "Www...." must match as well; otherwise the
    # special-character pass below strips ':' and '/' and leaves a fused
    # junk token in the text. This has to run before that pass.
    text = re.sub(r'http\S+|www\.\S+', '', text, flags=re.IGNORECASE)
    # Remove @mentions (must also run before '@' is stripped below)
    text = re.sub(r'@\w+', '', text)
    # Remove special characters but keep basic punctuation (! ? . ,),
    # which helps with context. Characters are deleted, not replaced by a
    # space, so "don't" becomes "dont".
    text = re.sub(r'[^\w\s!?.,]', '', text)
    # Normalize whitespace: collapse runs to one space, trim the ends
    text = re.sub(r'\s+', ' ', text).strip()
    return text