File size: 997 Bytes
e323466
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
"""Text preprocessing utilities for customer support intent classification."""

import re
from typing import List

import numpy as np


def clean_text(text: str) -> str:
    """Lowercase, strip non-ASCII, and normalize whitespace. Punctuation is preserved."""
    if not isinstance(text, str):
        text = str(text)
    text = text.lower()
    text = text.encode("ascii", errors="ignore").decode("ascii")
    text = re.sub(r"\s+", " ", text).strip()
    return text


def clean_texts(texts: List[str]) -> List[str]:
    """Apply clean_text to each string in texts."""
    return [clean_text(t) for t in texts]


def set_global_seeds(seed: int = 42) -> None:
    """Set random seeds for reproducibility across numpy, Python random, and torch."""
    import random

    random.seed(seed)
    np.random.seed(seed)
    try:
        import torch

        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)
    except ImportError:
        pass