File size: 4,465 Bytes
ad18db6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
"""
Text preprocessing pipeline for Nigerian English/Pidgin.

Design principles:
- Preserve linguistic features of Nigerian Pidgin (slang, contractions, code-switching)
- Remove noise (URLs, usernames) that doesn't contribute to language modeling
- Minimal normalization to avoid losing dialectal patterns
"""

import re
from typing import List


# Special tokens for sentence boundaries
START_TOKEN = "<s>"
END_TOKEN = "</s>"


def clean_text(text: str) -> str:
    """
    Clean text while preserving Nigerian Pidgin features.

    Operations:
    1. Lowercase (case doesn't matter for prediction)
    2. Remove URLs
    3. Remove @usernames (Twitter-style)
    4. Normalize whitespace

    Preserved:
    - Contractions (don't, I'm, na'm)
    - Slang (abi, sha, sef)
    - Code-switching patterns
    - Pidgin grammar structures

    Args:
        text: Raw text string.

    Returns:
        Cleaned text string.
    """
    lowered = text.lower()

    # Noise-removal passes, applied in order. Order matters only for the
    # final whitespace collapse, which cleans up gaps left by the deletions.
    substitutions = (
        (r'https?://\S+', ''),   # http/https URLs
        (r'www\.\S+', ''),       # bare www. URLs
        (r'@\w+', ''),           # Twitter-style handles
        (r'#(\w+)', r'\1'),      # drop hashtag marker, keep the word
        (r'\s+', ' '),           # collapse whitespace runs to single spaces
    )
    for pattern, replacement in substitutions:
        lowered = re.sub(pattern, replacement, lowered)

    return lowered.strip()


def tokenize(text: str) -> List[str]:
    """
    Word-level tokenization for Nigerian Pidgin.

    Handles:
    - Standard word boundaries
    - Punctuation as separate tokens
    - Preserves contractions as single tokens

    Apostrophe policy: apostrophes *inside* a word (don't, I'm, na'm) are
    untouched, so contractions stay as single tokens. Apostrophes at the
    edge of a word (quote marks, as in 'hello') are stripped and silently
    discarded — they are NOT emitted as punctuation tokens, so they cannot
    be confused with contraction apostrophes.

    Args:
        text: Cleaned text string.

    Returns:
        List of tokens.
    """
    # Punctuation that may cling to either edge of a word.
    # (The original trailing set listed '"' twice; deduplicated here.)
    leading_punct = '.,!?;:"\'-([{'
    trailing_punct = '.,!?;:"\'-)]}'

    tokens = []
    for word in text.split():
        # Peel punctuation off the front. Apostrophes are discarded rather
        # than emitted (see apostrophe policy in the docstring).
        while word and word[0] in leading_punct:
            if word[0] != "'":
                tokens.append(word[0])
            word = word[1:]

        # Peel punctuation off the back, preserving its original order.
        trailing = []
        while word and word[-1] in trailing_punct:
            if word[-1] != "'":
                trailing.insert(0, word[-1])
            word = word[:-1]

        if word:
            tokens.append(word)

        tokens.extend(trailing)

    return tokens


def preprocess_text(text: str) -> List[str]:
    """
    Full preprocessing pipeline: clean + tokenize.

    Args:
        text: Raw text string.

    Returns:
        List of tokens.
    """
    return tokenize(clean_text(text))


def add_sentence_markers(tokens: List[str]) -> List[str]:
    """
    Add start/end markers for sentence boundary modeling.

    For trigram models, we need context at sentence boundaries.
    We add two start tokens to provide full context for the first word.

    Args:
        tokens: List of tokens from a sentence.

    Returns:
        Tokens with boundary markers; empty input yields an empty list.
    """
    if not tokens:
        return []
    return [START_TOKEN] * 2 + tokens + [END_TOKEN]


def preprocess_corpus(texts: List[str]) -> List[List[str]]:
    """
    Preprocess entire corpus for language model training.

    Texts that clean down to nothing (e.g. URL-only tweets) are skipped
    entirely rather than producing empty marker-only sentences.

    Args:
        texts: List of raw text strings.

    Returns:
        List of tokenized sentences with boundary markers.
    """
    return [
        add_sentence_markers(tokens)
        for tokens in (preprocess_text(text) for text in texts)
        if tokens
    ]


if __name__ == "__main__":
    # Smoke-test the pipeline on representative Nigerian Pidgin inputs,
    # including one with Twitter noise (handle + URL).
    samples = [
        "I dey go market, you wan follow?",
        "That guy na correct person sha @handle https://example.com",
        "Wetin you dey do? Abi you no sabi?",
        "E don happen before, no be today matter",
        "How far? Everything dey go well?",
    ]

    print("Preprocessing Examples:\n")
    for sample in samples:
        toks = preprocess_text(sample)
        with_markers = add_sentence_markers(toks)
        print(f"Original: {sample}")
        print(f"Tokens:   {toks}")
        print(f"Marked:   {with_markers}")
        print()