"""
Text preprocessing pipeline for Nigerian English/Pidgin.
Design principles:
- Preserve linguistic features of Nigerian Pidgin (slang, contractions, code-switching)
- Remove noise (URLs, usernames) that don't contribute to language modeling
- Minimal normalization to avoid losing dialectal patterns
"""
import re
from typing import List
# Special tokens for sentence boundaries
START_TOKEN = "<s>"
END_TOKEN = "</s>"
def clean_text(text: str) -> str:
    """
    Clean raw text while preserving Nigerian Pidgin features.

    Operations (in order):
    1. Lowercase (case carries no signal for next-word prediction)
    2. Strip URLs (http/https and bare www forms)
    3. Strip Twitter-style @usernames
    4. Unwrap hashtags, keeping the word itself
    5. Collapse all whitespace runs to single spaces

    Deliberately preserved:
    - Contractions (don't, I'm, na'm)
    - Slang (abi, sha, sef)
    - Code-switching and Pidgin grammar structures

    Args:
        text: Raw text string.

    Returns:
        Cleaned text string.
    """
    normalized = text.lower()
    # Ordered (pattern, replacement) pairs; order matters — URLs must go
    # before the whitespace collapse so leftover gaps are merged.
    substitutions = (
        (r'https?://\S+', ''),   # full URLs
        (r'www\.\S+', ''),       # scheme-less URLs
        (r'@\w+', ''),           # @usernames
        (r'#(\w+)', r'\1'),      # hashtags -> bare word
        (r'\s+', ' '),           # normalize whitespace
    )
    for pattern, replacement in substitutions:
        normalized = re.sub(pattern, replacement, normalized)
    return normalized.strip()
def tokenize(text: str) -> List[str]:
    """
    Word-level tokenization for Nigerian Pidgin.

    Handles:
    - Standard word boundaries (whitespace split)
    - Surrounding punctuation emitted as separate tokens
    - Contractions kept as single tokens, including boundary
      apostrophes ('em, goin')

    Args:
        text: Cleaned text string.

    Returns:
        List of tokens.
    """
    tokens = []
    for word in text.split():
        # Peel leading punctuation off as individual tokens.
        while word and word[0] in '.,!?;:"\'-([{':
            if word[0] == "'":
                # Stop stripping: a leading apostrophe belongs to the
                # contraction ('em, 'im). The old code dropped it entirely.
                break
            tokens.append(word[0])
            word = word[1:]
        # Peel trailing punctuation, preserving its original left-to-right
        # order by collecting then emitting after the word.
        trailing = []
        while word and word[-1] in '.,!?;:"\'-)]}':
            if word[-1] == "'":
                # Keep a trailing apostrophe attached (goin', nothin').
                break
            trailing.insert(0, word[-1])
            word = word[:-1]
        if word:
            tokens.append(word)
        tokens.extend(trailing)
    return tokens
def preprocess_text(text: str) -> List[str]:
    """
    Run the full preprocessing pipeline: clean, then tokenize.

    Args:
        text: Raw text string.

    Returns:
        List of tokens.
    """
    return tokenize(clean_text(text))
def add_sentence_markers(tokens: List[str]) -> List[str]:
    """
    Wrap a token sequence with sentence-boundary markers.

    Trigram models need context at sentence boundaries, so two start
    tokens are prepended to give the first real word a full bigram of
    context; a single end token closes the sentence.

    Args:
        tokens: List of tokens from a sentence.

    Returns:
        Tokens with boundary markers, or [] when `tokens` is empty.
    """
    if not tokens:
        return []
    marked = [START_TOKEN, START_TOKEN]
    marked.extend(tokens)
    marked.append(END_TOKEN)
    return marked
def preprocess_corpus(texts: List[str]) -> List[List[str]]:
    """
    Preprocess an entire corpus for language model training.

    Args:
        texts: List of raw text strings.

    Returns:
        List of tokenized sentences with boundary markers; inputs that
        clean down to nothing are dropped.
    """
    sentences = []
    for raw in texts:
        tokens = preprocess_text(raw)
        if not tokens:
            # Nothing survived cleaning (e.g. a URL-only tweet) — skip.
            continue
        sentences.append(add_sentence_markers(tokens))
    return sentences
if __name__ == "__main__":
    # Smoke-test the pipeline on a handful of Nigerian Pidgin examples.
    samples = (
        "I dey go market, you wan follow?",
        "That guy na correct person sha @handle https://example.com",
        "Wetin you dey do? Abi you no sabi?",
        "E don happen before, no be today matter",
        "How far? Everything dey go well?",
    )
    print("Preprocessing Examples:\n")
    for sample in samples:
        tokens = preprocess_text(sample)
        print(f"Original: {sample}")
        print(f"Tokens: {tokens}")
        print(f"Marked: {add_sentence_markers(tokens)}")
        print()