File size: 408 Bytes
398a289
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
import re


def normalize_text(text: object) -> str:
    """Return a canonical form of *text* suitable for deduplication.

    The value is lowercased, stripped of punctuation (including
    underscores), and has every run of whitespace collapsed to a single
    space with leading/trailing whitespace removed.

    Args:
        text: The value to normalize. Non-string values are converted
            with ``str()`` first and then normalized like any other
            string, so ``3.5`` and ``"3.5"`` yield the same key.

    Returns:
        The normalized string; empty if nothing but punctuation and
        whitespace was present.
    """
    # Coerce first instead of returning early: an un-normalized return for
    # non-strings would give equal values different deduplication keys.
    if not isinstance(text, str):
        text = str(text)
    lowered = text.lower()
    # ``\w`` counts "_" as a word character, so it has to be removed
    # explicitly to really keep only alphanumerics and whitespace.
    without_punctuation = re.sub(r"[^\w\s]|_", "", lowered)
    # Collapse whitespace runs (tabs, newlines, repeated spaces) and trim.
    return re.sub(r"\s+", " ", without_punctuation).strip()