"""
Text Preprocessing Module.

Provides text cleaning and normalization utilities for:
- Message sanitization
- Devanagari digit conversion
- Text normalization
"""
import re
from typing import Optional
def clean_text(text: str) -> str:
    """
    Sanitize a raw message into a single-spaced, trimmed string.

    Processing steps, in order:
    - Drop C0/C1 control characters (tab, newline and carriage return
      survive this step)
    - Collapse every run of whitespace, including the surviving tabs and
      newlines, into one space
    - Trim whitespace from both ends

    Args:
        text: Input text; any falsy value (``""`` or ``None``) is accepted

    Returns:
        Cleaned text, or an empty string for falsy input
    """
    if text:
        # Control characters are deleted outright, not replaced by a space.
        without_controls = re.sub(
            r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]", "", text
        )
        # Collapse whitespace runs, then trim the ends.
        return re.sub(r"\s+", " ", without_controls).strip()
    return ""
def normalize_text(text: str, lowercase: bool = False) -> str:
    """
    Produce a canonical form of the text for downstream processing.

    The text is cleaned with ``clean_text``, optionally lowercased, and
    finally has its Devanagari digits replaced by ASCII digits.

    Args:
        text: Input text
        lowercase: When True, lowercase the cleaned text

    Returns:
        Normalized text
    """
    cleaned = clean_text(text)
    cased = cleaned.lower() if lowercase else cleaned
    # Digit conversion runs last, on the already cleaned and cased text.
    return convert_devanagari_digits(cased)
# Translation table mapping the Devanagari digits U+0966..U+096F to ASCII
# "0".."9". Built once at import time so each call is a single pass.
_DEVANAGARI_DIGIT_TABLE = str.maketrans(
    "\u0966\u0967\u0968\u0969\u096A\u096B\u096C\u096D\u096E\u096F",
    "0123456789",
)


def convert_devanagari_digits(text: str) -> str:
    """
    Convert Devanagari digits to ASCII digits.

    Every character other than the ten Devanagari digits
    (U+0966 through U+096F) is left unchanged.

    Args:
        text: Input text containing potential Devanagari digits

    Returns:
        Text with Devanagari digits converted to ASCII
    """
    # One translate() pass replaces ten chained str.replace() calls.
    return text.translate(_DEVANAGARI_DIGIT_TABLE)
def truncate_text(text: str, max_length: int = 5000, suffix: str = "...") -> str:
    """
    Truncate text to maximum length.

    The result never exceeds ``max_length`` characters. When the text is
    cut, ``suffix`` is appended if there is room for it; if ``max_length``
    is shorter than the suffix, the text is hard-cut without a suffix.

    Args:
        text: Input text
        max_length: Maximum allowed length; values <= 0 yield ""
        suffix: Suffix to add if truncated

    Returns:
        Truncated text
    """
    if len(text) <= max_length:
        return text
    if max_length <= 0:
        return ""

    keep = max_length - len(suffix)
    if keep < 0:
        # No room for the suffix. A negative slice bound would count from the
        # end of the text and return something longer than max_length.
        return text[:max_length]
    return text[:keep] + suffix
def remove_urls(text: str) -> str:
    """
    Strip http:// and https:// URLs out of the text.

    A URL starts at a lowercase ``http://`` or ``https://`` and runs until
    the first whitespace or one of ``< > " { } | \\ ^ ` [ ]``. Surrounding
    whitespace is left in place.

    Args:
        text: Input text

    Returns:
        Text with URLs removed
    """
    url_matcher = re.compile(r"https?://[^\s<>\"{}|\\^`\[\]]+")
    return url_matcher.sub("", text)
def extract_numbers(text: str) -> list:
    """
    Collect every run of consecutive digits found in the text.

    Devanagari digits are converted to ASCII first, so they come back as
    ASCII digit strings.

    Args:
        text: Input text

    Returns:
        List of number strings, in order of appearance
    """
    ascii_text = convert_devanagari_digits(text)
    # Each match is one maximal run of digits.
    return [match.group() for match in re.finditer(r"\d+", ascii_text)]
# UPI IDs: handle@provider.
_UPI_ID_RE = re.compile(r"\b[a-zA-Z0-9._-]+@[a-zA-Z]+\b")
# Standalone mobile number: ten digits starting 6-9, either introduced by
# "+91" or not directly preceded by another digit (so it cannot bite into the
# tail of a longer account number).
_STANDALONE_PHONE_RE = re.compile(r"(?:\+91[\s-]?|(?<!\d))[6-9]\d{9}\b")
# Bank accounts: a delimited run of 9-18 digits.
_ACCOUNT_RE = re.compile(r"\b\d{9,18}\b")
# Fallback: mobile-like ten-digit tail of a digit run the account pattern
# could not match (for example a run longer than 18 digits).
_EMBEDDED_PHONE_RE = re.compile(r"(?:\+91[\s-]?)?[6-9]\d{9}\b")


def mask_sensitive_data(text: str) -> str:
    """
    Mask sensitive data in text for logging.

    Masks:
    - UPI IDs
    - Phone numbers
    - Bank account numbers

    Phone numbers are masked before bank accounts. A standalone 10-digit
    mobile number also fits the 9-18 digit account pattern, so running the
    account pattern first would label every phone number as an account and
    leave any "+91" prefix behind. A 10-digit number starting with 6-9 is
    therefore always reported as a phone number.

    Args:
        text: Input text

    Returns:
        Text with sensitive data masked
    """
    # UPI first, so "9876543210@bank" is masked as one UPI ID.
    text = _UPI_ID_RE.sub("[UPI_MASKED]", text)
    text = _STANDALONE_PHONE_RE.sub("[PHONE_MASKED]", text)
    text = _ACCOUNT_RE.sub("[ACCOUNT_MASKED]", text)
    # Last resort keeps the coverage the unanchored phone pattern always had.
    text = _EMBEDDED_PHONE_RE.sub("[PHONE_MASKED]", text)
    return text
|