File size: 947 Bytes

0ec6609

import re
import string

def preprocess_text(text: str) -> str:
    """
    Lightweight preprocessing:
    - lowercase
    - URL normalization
    - remove excessive repeated characters
    - strip punctuation
    """
    if not text:
        return ""
    
    # Lowercase
    text = text.lower()
    
    # URL normalization
    text = re.sub(r'https?://\S+|www\.\S+', ' [URL] ', text)
    
    # Remove excessive repeated characters (e.g., "freeeeee" -> "free")
    text = re.sub(r'(.)\1{2,}', r'\1', text)
    
    # Handle spaced out characters (e.g., "F R E E" -> "FREE")
    # Only if they are single characters separated by spaces, and more than 2 in a row
    text = re.sub(r'\b(\w\s){2,}\w\b', lambda m: m.group().replace(' ', ''), text)
    
    # Strip punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text