"""
PhilVerify — Text Preprocessor
Handles cleaning, tokenizing, and normalizing Filipino/English/Taglish text.
"""
import re
import string
import unicodedata
from dataclasses import dataclass, field
# -- Filipino + English stopwords ----------------------------------------------
# Function words dropped in step 10 of the pipeline (see TextPreprocessor).
# Tagalog set includes markers (ang/ng/sa), pronouns, enclitic particles
# (na/pa/nga/naman/raw/daw/po/ho), and common discourse words.
TAGALOG_STOPWORDS = {
    "ang", "ng", "na", "sa", "at", "ay", "mga", "ni", "nang", "si",
    "ko", "mo", "siya", "kami", "kayo", "sila", "ito", "iyon", "iyan",
    "dito", "doon", "diyan", "nito", "noon", "niyan", "rin", "din", "pa",
    "lang", "lamang", "nga", "naman", "kaya", "pero", "dahil", "kung",
    "kapag", "habang", "bilang", "upang", "para", "mula", "hanggang",
    "ayon", "sinabi", "raw", "daw", "ba", "po", "ho", "oh", "oo",
    "hindi", "wala", "may", "mayroon", "talaga", "pala", "sana",
}
# English articles, conjunctions, auxiliaries, and pronouns.
ENGLISH_STOPWORDS = {
    "a", "an", "the", "and", "or", "but", "in", "on", "at", "to",
    "for", "of", "with", "by", "from", "is", "are", "was", "were",
    "be", "been", "being", "have", "has", "had", "do", "does", "did",
    "will", "would", "could", "should", "may", "might", "shall", "can",
    "not", "no", "nor", "so", "yet", "both", "either", "neither",
    "this", "that", "these", "those", "it", "its", "i", "me", "my",
    "we", "our", "you", "your", "they", "their", "he", "his", "she", "her",
}
# Union used by TextPreprocessor.remove_stopwords (handles Taglish text,
# which mixes both languages in one sentence).
ALL_STOPWORDS = TAGALOG_STOPWORDS | ENGLISH_STOPWORDS
# -- Precompiled cleaning patterns ---------------------------------------------
# Matches http/https URLs character-by-character (letters, digits, and the
# punctuation legal in URLs, including percent-escapes).
_URL_PATTERN = re.compile(
    r"http[s]?://(?:[a-zA-Z]|[0-9]|[$\-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
)
_HTML_TAG_PATTERN = re.compile(r"<[^>]+>")  # any <...> markup tag
_MENTION_PATTERN = re.compile(r"@\w+")      # social-media @handles
_HASHTAG_PATTERN = re.compile(r"#\w+")      # #tag; the word is kept, "#" dropped
# A char repeated 3+ times; sub(r"\1\1") caps the run at two,
# e.g. "graaabe" -> "graabe" (NOT "grabe" -- two copies survive).
_REPEATED_CHAR_PATTERN = re.compile(r"(.)\1{2,}")
# Runs of terminal punctuation, e.g. "!!!" or "?!?" -> one char via sub(r"\1").
_EXCESSIVE_PUNCT_PATTERN = re.compile(r"([!?.]){2,}")
_WHITESPACE_PATTERN = re.compile(r"\s+")    # collapse any whitespace run
# Emoji removal via unicode category
def _remove_emojis(text: str) -> str:
return "".join(
ch for ch in text
if not unicodedata.category(ch).startswith("So") # Symbol, Other
and unicodedata.category(ch) not in ("Mn",) # Modifier letters
)
@dataclass
class PreprocessResult:
    """Structured output of TextPreprocessor.preprocess()."""

    original: str   # the raw input text, untouched
    cleaned: str    # after structural cleaning (tags/URLs/mentions/emoji/case)
    normalized: str  # after character-level normalization (repeats, punctuation)
    # Whitespace tokens of the normalized text (single-char tokens dropped).
    tokens: list[str] = field(default_factory=list)
    # `tokens` with English + Tagalog stopwords removed.
    filtered_tokens: list[str] = field(default_factory=list)
    char_count: int = 0  # len(normalized)
    word_count: int = 0  # len(tokens), counted before stopword removal
class TextPreprocessor:
    """
    Multi-step text cleaner for Tagalog / English / Taglish content.

    Pipeline:
        1. strip_html        -> remove HTML tags
        2. strip_urls        -> remove hyperlinks
        3. strip_mentions    -> remove @user
        4. strip_hashtags    -> remove the "#" marker (keep the word)
        5. strip_emojis      -> remove Unicode emoji
        6. lowercase         -> normalize case
        7. normalize_chars   -> collapse repeated chars and excessive "!?."
        8. strip_punct       -> remove punctuation except apostrophe
        9. tokenize          -> split on whitespace
        10. remove_stopwords -> drop EN + TL stopwords
    """

    def clean(self, text: str) -> str:
        """Steps 1-6: structural cleaning; also collapses runs of whitespace."""
        # Tags, links, and mentions are all replaced by a space, in that order.
        for pattern in (_HTML_TAG_PATTERN, _URL_PATTERN, _MENTION_PATTERN):
            text = pattern.sub(" ", text)
        # "#balita" -> "balita": drop only the marker, keep the word.
        text = _HASHTAG_PATTERN.sub(lambda m: m.group(0)[1:], text)
        text = _remove_emojis(text).lower()
        return _WHITESPACE_PATTERN.sub(" ", text).strip()

    def normalize(self, text: str) -> str:
        """Steps 7-8: character-level normalization."""
        # Cap any repeated-character run at two: "graaabe" -> "graabe".
        text = _REPEATED_CHAR_PATTERN.sub(r"\1\1", text)
        # Shrink "!!!", "?!?" etc. to a single mark.
        text = _EXCESSIVE_PUNCT_PATTERN.sub(r"\1", text)
        # Apostrophes survive ('di, 'yung); other punctuation becomes a space.
        chars: list[str] = []
        for ch in text:
            if ch == "'" or ch not in string.punctuation:
                chars.append(ch)
            else:
                chars.append(" ")
        return _WHITESPACE_PATTERN.sub(" ", "".join(chars)).strip()

    def tokenize(self, text: str) -> list[str]:
        """Step 9: whitespace split, discarding single-character tokens."""
        return [tok for tok in text.split() if len(tok) > 1]

    def remove_stopwords(self, tokens: list[str]) -> list[str]:
        """Step 10: drop English and Tagalog stopwords."""
        return [tok for tok in tokens if tok not in ALL_STOPWORDS]

    def preprocess(self, text: str) -> PreprocessResult:
        """Run the full pipeline and return a structured result."""
        cleaned = self.clean(text)
        normalized = self.normalize(cleaned)
        tokens = self.tokenize(normalized)
        return PreprocessResult(
            original=text,
            cleaned=cleaned,
            normalized=normalized,
            tokens=tokens,
            filtered_tokens=self.remove_stopwords(tokens),
            char_count=len(normalized),
            word_count=len(tokens),
        )