# Ridge-Classifier-Prohori / preprocess.py
# squadgoals404's picture
# Initial upload of Ridge-Prohori (Bangla SMS smishing classifier)
# 42e9ab0 verified
import re
# English call-to-action words, assembled into one capturing alternation.
# ``log\s*in`` additionally accepts "log in" written with optional whitespace.
cue_words_en = "(" + "|".join((
    "check", "click", "visit", "tap", "verify", "open",
    "login", r"log\s*in", "see", "confirm", "update", "activate",
)) + ")"

# Bengali-script counterparts of the cue words, built the same way.
cue_words_bn = "(" + "|".join((
    "চেক", "ক্লিক", "ভিজিট", "ট্যাপ", "যাচাই", "লগইন",
    r"লগ\s*ইন", "দেখুন", "আপডেট", "অ্যাকটিভেট", "নিশ্চিত",
)) + ")"

# Matches, in order of preference:
#   1. http:// or https:// followed by any run of non-whitespace,
#   2. "www." followed by any run of non-whitespace,
#   3. a bare domain-like token: letters/digits/dots/hyphens, a dot, at least
#      two ASCII letters, then any trailing non-whitespace.
url_pat = re.compile(
    r"(https?://\S+|www\.\S+|\b[A-Za-z0-9.-]+\.[A-Za-z]{2,}\S*)",
    flags=re.IGNORECASE,
)

# Matches a cue word (either language) plus any whitespace after it, but only
# when a URL starts right there; the URL itself is only looked ahead at and is
# not consumed.
# NOTE(review): a cue word that is itself a host label (e.g. "login.bank.com")
# also matches here, because ".bank.com" satisfies the bare-domain alternative
# of url_pat. Confirm whether that is intended before changing it, since any
# change alters the text produced for downstream consumers.
cue_before_url_pat = re.compile(
    r"(\b" + cue_words_en + r"\b|\b" + cue_words_bn + r"\b)"
    r"\s*"
    r"(?=" + url_pat.pattern + r")",
    flags=re.IGNORECASE,
)
def normalize_text(t: str) -> str:
    """Mask link cues and URLs in *t*, then lowercase and trim the result.

    Steps, in order:

    1. ``t`` is coerced with ``str()``, so non-string inputs are processed
       as their string form.
    2. Every match of ``cue_before_url_pat`` is replaced by ``"<LINK_CUE> "``.
    3. Every match of ``url_pat`` is replaced by ``"<URL>"``.
    4. The text is lowercased and surrounding whitespace is stripped.

    Because lowercasing happens last, the placeholders appear in the returned
    string as ``<link_cue>`` and ``<url>``.
    """
    text = str(t)
    # Cues must be masked first: their pattern looks ahead for a URL, which
    # would no longer be present once URLs are replaced.
    text = cue_before_url_pat.sub("<LINK_CUE> ", text)
    text = url_pat.sub("<URL>", text)
    return text.lower().strip()