Spaces:
Sleeping
Sleeping
File size: 408 Bytes
398a289 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | import re
def normalize_text(text):
    """Normalize text for deduplication.

    Lowercases the input, removes punctuation, collapses every run of
    whitespace into a single space, and strips leading/trailing whitespace.

    Args:
        text: Value to normalize. A non-string value is converted with
            ``str()`` and then normalized exactly like a string, so that
            e.g. ``None`` and ``"None"`` produce the same key.

    Returns:
        The normalized string.
    """
    if not isinstance(text, str):
        # Coerce and fall through instead of returning early: an early return
        # would skip normalization and make the key depend on the input type.
        text = str(text)

    text = text.lower()

    # Drop everything that is neither a word character nor whitespace.
    # NOTE: the regex word class also matches "_", so underscores are kept.
    text = re.sub(r"[^\w\s]", "", text)

    # Collapse whitespace runs (spaces, tabs, newlines) and trim the ends.
    return re.sub(r"\s+", " ", text).strip()
|