customer-support-agent / src /data /preprocessing.py
pro580's picture
Fix rate limiter to use X-Forwarded-For header behind HF proxy
e323466
Raw
History Blame Contribute Delete
997 Bytes
"""Text preprocessing utilities for customer support intent classification."""
import re
from typing import List
import numpy as np
def clean_text(text: str) -> str:
    """Return *text* lowercased, reduced to ASCII, with whitespace collapsed.

    Non-string input is converted with ``str()`` first. Characters that
    cannot be encoded as ASCII are dropped (not transliterated), every run
    of whitespace becomes a single space, and leading/trailing whitespace
    is removed. Punctuation is left untouched.
    """
    raw = text if isinstance(text, str) else str(text)
    # Order matters: lowercase first, then discard whatever is not ASCII.
    ascii_only = raw.lower().encode("ascii", errors="ignore").decode("ascii")
    return re.sub(r"\s+", " ", ascii_only).strip()
def clean_texts(texts: List[str]) -> List[str]:
    """Return a new list holding ``clean_text(t)`` for each ``t`` in *texts*, in order."""
    return list(map(clean_text, texts))
def set_global_seeds(seed: int = 42) -> None:
    """Seed Python's ``random``, NumPy, and (if importable) torch with *seed*.

    The torch step is best-effort: when ``import torch`` raises
    ``ImportError`` the function returns after seeding the other two
    generators. When CUDA is reported as available, all CUDA devices are
    seeded as well.
    """
    import random as py_random

    # Standard library first, then NumPy's legacy global generator.
    for seed_fn in (py_random.seed, np.random.seed):
        seed_fn(seed)

    try:
        import torch

        torch.manual_seed(seed)
        cuda = torch.cuda
        if cuda.is_available():
            cuda.manual_seed_all(seed)
    except ImportError:
        # torch is optional; nothing more to seed without it.
        return