File size: 3,756 Bytes
31f0e50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
"""

Text Preprocessing Module.



Provides text cleaning and normalization utilities for:

- Message sanitization

- Devanagari digit conversion

- Text normalization

"""

import re
from typing import Optional


def clean_text(text: str) -> str:
    """
    Sanitize raw input text.

    Control characters are dropped, every run of whitespace (tabs and
    newlines included) is collapsed into a single space, and leading and
    trailing whitespace is trimmed.

    Args:
        text: Input text. Any falsy value (``""``, ``None``) yields ``""``.

    Returns:
        Cleaned text
    """
    if not text:
        return ""

    # Drop C0/C1 control characters and DEL. Tab, LF and CR are excluded
    # from this class so that they still act as separators below.
    without_controls = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]", "", text)

    # Splitting on whitespace and re-joining with one space both collapses
    # inner whitespace runs and trims the two ends.
    return " ".join(without_controls.split())


def normalize_text(text: str, lowercase: bool = False) -> str:
    """
    Normalize text for downstream processing.

    The input is cleaned with ``clean_text``, optionally lowercased, and
    finally passed through ``convert_devanagari_digits``.

    Args:
        text: Input text
        lowercase: Convert to lowercase if True

    Returns:
        Normalized text
    """
    cleaned = clean_text(text)
    cased = cleaned.lower() if lowercase else cleaned

    # Digit conversion runs last, on the already cleaned/cased string.
    return convert_devanagari_digits(cased)


def convert_devanagari_digits(text: str) -> str:
    """
    Replace Devanagari digits with their ASCII equivalents.

    All other characters are returned unchanged.

    Args:
        text: Input text containing potential Devanagari digits

    Returns:
        Text with Devanagari digits converted to ASCII
    """
    # U+0966 .. U+096F map, in order, onto "0" .. "9".
    digit_table = str.maketrans(
        "\u0966\u0967\u0968\u0969\u096A\u096B\u096C\u096D\u096E\u096F",
        "0123456789",
    )
    return text.translate(digit_table)


def truncate_text(text: str, max_length: int = 5000, suffix: str = "...") -> str:
    """
    Truncate text to a maximum length.

    The returned string is never longer than ``max_length`` (for
    non-negative ``max_length``). When the text has to be shortened, the
    suffix counts towards the limit.

    Args:
        text: Input text
        max_length: Maximum allowed length of the result
        suffix: Suffix to add if truncated

    Returns:
        ``text`` unchanged if it already fits; otherwise a prefix of
        ``text`` followed by ``suffix``. If ``max_length`` is too small to
        hold the whole suffix, the suffix itself is cut to ``max_length``
        characters; a ``max_length`` of zero or less yields ``""``.
    """
    if len(text) <= max_length:
        return text

    if max_length <= 0:
        return ""

    keep = max_length - len(suffix)
    if keep < 0:
        # No room for the full suffix. Slicing text with a negative end
        # would keep almost all of it and overshoot the limit, so return
        # as much of the suffix as fits instead.
        return suffix[:max_length]

    return text[:keep] + suffix


def remove_urls(text: str) -> str:
    """
    Strip http/https URLs out of text.

    Only the URL itself is deleted; whitespace that surrounded it is
    left in place.

    Args:
        text: Input text

    Returns:
        Text with URLs removed
    """
    # A URL runs from the scheme up to the first whitespace character or
    # one of the excluded delimiter characters.
    url_matcher = re.compile(r"https?://[^\s<>\"{}|\\^`\[\]]+")
    return url_matcher.sub("", text)


def extract_numbers(text: str) -> list:
    """
    Collect every run of digits found in text.

    Devanagari digits are converted to ASCII first, so they come back
    as ASCII digit strings.

    Args:
        text: Input text

    Returns:
        List of number strings, in order of appearance
    """
    ascii_text = convert_devanagari_digits(text)

    # NOTE: on str input, \d matches any Unicode decimal digit, so digits
    # from scripts other than Devanagari are returned as they appear.
    return [match.group(0) for match in re.finditer(r"\d+", ascii_text)]


def mask_sensitive_data(text: str) -> str:
    """
    Mask sensitive data in text for logging.

    Masks, in this order:

    - UPI IDs (``name@handle``) -> ``[UPI_MASKED]``
    - Phone numbers (10 digits starting with 6-9, optionally prefixed
      with ``+91``) -> ``[PHONE_MASKED]``
    - Bank account numbers (standalone runs of 9-18 digits)
      -> ``[ACCOUNT_MASKED]``

    Args:
        text: Input text

    Returns:
        Text with sensitive data masked
    """
    # Mask UPI IDs first so a numeric handle such as 9876543210@bank is
    # replaced as a whole.
    text = re.sub(r"\b[a-zA-Z0-9._-]+@[a-zA-Z]+\b", "[UPI_MASKED]", text)

    # Mask phone numbers BEFORE account numbers: a 10-digit phone number
    # also satisfies the 9-18 digit account pattern, so running the account
    # pass first would label every phone number as an account and leave a
    # "+91" prefix behind. The (?<!\d) guard stops this pass from matching
    # the tail of a longer digit run (e.g. a 16-digit account number).
    text = re.sub(
        r"(?<!\d)(?:\+91[\s-]?)?[6-9]\d{9}\b", "[PHONE_MASKED]", text
    )

    # Mask bank accounts (9-18 digits)
    text = re.sub(r"\b\d{9,18}\b", "[ACCOUNT_MASKED]", text)

    return text