ktoan911's picture
Upload folder using huggingface_hub
398a289 verified
raw
history blame contribute delete
408 Bytes
import re
import unicodedata
def normalize_text(text):
    """Return a canonical form of *text* for use as a deduplication key.

    String input is lowercased, composed to Unicode NFC, stripped of every
    character that is not a letter, digit, or whitespace, and has each run
    of whitespace collapsed to a single space with the ends trimmed.

    Non-string input is not normalized: it is returned as ``str(text)``.
    """
    if not isinstance(text, str):
        return str(text)

    text = text.lower()

    # Compose to NFC before stripping. Combining marks are not matched by
    # ``\w``, so a decomposed "e" + U+0301 would otherwise lose its accent
    # while the precomposed U+00E9 kept it, and the two canonically
    # equivalent spellings would end up with different keys.
    text = unicodedata.normalize("NFC", text)

    # Drop punctuation and symbols. ``\w`` also matches "_", so it is
    # removed explicitly to keep only alphanumerics and whitespace.
    text = re.sub(r"[^\w\s]|_", "", text)

    # Collapse whitespace runs and trim the ends.
    return re.sub(r"\s+", " ", text).strip()