Spaces:
Sleeping
Sleeping
| """ | |
| Text preprocessing pipeline for Nigerian English/Pidgin. | |
| Design principles: | |
| - Preserve linguistic features of Nigerian Pidgin (slang, contractions, code-switching) | |
| - Remove noise (URLs, usernames) that doesn't contribute to language modeling | |
| - Minimal normalization to avoid losing dialectal patterns | |
| """ | |
| import re | |
| from typing import List | |
| # Special tokens for sentence boundaries | |
| START_TOKEN = "<s>" | |
| END_TOKEN = "</s>" | |
# Substitutions applied by clean_text, in order, as (pattern, replacement).
_CLEANUP_SUBSTITUTIONS = (
    # Links that carry an explicit http:// or https:// scheme.
    (re.compile(r'https?://\S+'), ''),
    # Scheme-less links. The leading \b stops stretched interjections such as
    # "awww..." from being mistaken for a "www." link and deleted.
    (re.compile(r'\bwww\.\S+'), ''),
    # Twitter-style mentions. The lookbehind skips an "@" that directly
    # follows a word character, so "me@example.com" is not cut to "me.com".
    (re.compile(r'(?<!\w)@\w+'), ''),
    # Hashtags: drop the "#" marker but keep the tag text as a word.
    (re.compile(r'#(\w+)'), r'\1'),
    # Collapse every run of whitespace into a single space.
    (re.compile(r'\s+'), ' '),
)


def clean_text(text: str) -> str:
    """
    Clean text while preserving Nigerian Pidgin features.

    Operations, in order:
    1. Lowercase
    2. Remove URLs (http://, https:// and word-initial www.)
    3. Remove @usernames (Twitter-style, i.e. not preceded by a word character)
    4. Strip the '#' from hashtags, keeping the tag word
    5. Normalize whitespace and trim both ends

    Nothing else is touched, so contractions (don't, i'm), slang (abi, sha,
    sef) and punctuation pass through unchanged.

    Args:
        text: Raw text string.

    Returns:
        Cleaned text string; empty if nothing but noise/whitespace was given.
    """
    # Lowercase first so the patterns above only need to match lowercase text.
    text = text.lower()

    # Order matters: URLs go before mentions/hashtags so that an '@' or '#'
    # inside a link is removed together with the link.
    for pattern, replacement in _CLEANUP_SUBSTITUTIONS:
        text = pattern.sub(replacement, text)

    return text.strip()
def tokenize(text: str) -> List[str]:
    """
    Word-level tokenization for Nigerian Pidgin.

    The text is split on whitespace; each chunk is then peeled from both
    ends:
    - punctuation at the start or end of a chunk becomes its own token
    - apostrophes at the start or end of a chunk are discarded (they are
      neither kept on the word nor emitted as tokens)
    - everything between the peeled ends stays one token, so interior
      apostrophes survive and contractions (don't, i'm) remain whole

    Args:
        text: Cleaned text string.

    Returns:
        List of tokens.
    """
    opening_marks = '.,!?;:"\'-([{'
    closing_marks = '.,!?;:"\'-)]}"'
    apostrophe = "'"

    result: List[str] = []
    for chunk in text.split():
        first, last = 0, len(chunk)

        # Walk in from the left, emitting each punctuation mark as a token.
        # An edge apostrophe is stepped over without being emitted.
        while first < last and chunk[first] in opening_marks:
            if chunk[first] != apostrophe:
                result.append(chunk[first])
            first += 1

        # Walk in from the right to find where the word proper ends; the
        # marks found here are emitted after the word, in reading order.
        while last > first and chunk[last - 1] in closing_marks:
            last -= 1

        if first < last:
            result.append(chunk[first:last])

        # Trailing punctuation, again dropping edge apostrophes.
        result.extend(mark for mark in chunk[last:] if mark != apostrophe)

    return result
def preprocess_text(text: str) -> List[str]:
    """
    Run one raw string through the whole pipeline.

    The string is first passed to clean_text and the cleaned result is then
    handed to tokenize.

    Args:
        text: Raw text string.

    Returns:
        List of tokens.
    """
    return tokenize(clean_text(text))
def add_sentence_markers(tokens: List[str], *, order: int = 3) -> List[str]:
    """
    Add start/end markers for sentence boundary modeling.

    An n-gram model of order ``order`` conditions each word on the previous
    ``order - 1`` tokens, so that many START_TOKEN copies are prepended to
    give the first real word a full context. A single END_TOKEN is appended.
    The default ``order=3`` (trigram) prepends two start tokens.

    Args:
        tokens: List of tokens from a sentence.
        order: N-gram order of the model the sentence is prepared for.
            Must be at least 1; ``order=1`` prepends no start token.

    Returns:
        A new list of tokens with boundary markers, or an empty list when
        ``tokens`` is empty (an empty sentence gets no markers).

    Raises:
        ValueError: If ``order`` is less than 1.
    """
    if order < 1:
        raise ValueError(f"order must be >= 1, got {order}")

    if not tokens:
        return []

    return [START_TOKEN] * (order - 1) + list(tokens) + [END_TOKEN]
def preprocess_corpus(texts: List[str]) -> List[List[str]]:
    """
    Preprocess an entire corpus for language model training.

    Each raw string goes through preprocess_text; strings that yield no
    tokens are skipped, and every remaining token list is wrapped with
    add_sentence_markers.

    Args:
        texts: List of raw text strings.

    Returns:
        List of tokenized sentences with boundary markers, in input order.
    """
    # Lazy generator: each text is tokenized only when the comprehension
    # below asks for it.
    token_lists = (preprocess_text(raw) for raw in texts)
    return [add_sentence_markers(toks) for toks in token_lists if toks]
if __name__ == "__main__":
    # Demo run: push a few Nigerian Pidgin sentences through the pipeline
    # and print each stage.
    test_texts = [
        "I dey go market, you wan follow?",
        "That guy na correct person sha @handle https://example.com",
        "Wetin you dey do? Abi you no sabi?",
        "E don happen before, no be today matter",
        "How far? Everything dey go well?",
    ]

    print("Preprocessing Examples:\n")

    for sample in test_texts:
        sample_tokens = preprocess_text(sample)
        sample_marked = add_sentence_markers(sample_tokens)
        # One call per example; the final "" produces the blank separator line.
        print(
            f"Original: {sample}",
            f"Tokens: {sample_tokens}",
            f"Marked: {sample_marked}",
            "",
            sep="\n",
        )