import re

# English cue words. When one of these stands directly before a URL, it is
# dropped together with that URL by normalize_text().
cue_words_en = r"(check|click|visit|tap|verify|open|login|log\s*in|see|confirm|update|activate)"

# Bengali cue words, handled the same way as the English list above.
cue_words_bn = r"(চেক|ক্লিক|ভিজিট|ট্যাপ|যাচাই|লগইন|লগ\s*ইন|দেখুন|আপডেট|অ্যাকটিভেট|নিশ্চিত)"

# URL-like tokens: explicit http(s) links, "www." hosts, or bare
# "name.tld" tokens together with whatever non-space characters follow.
url_pat = re.compile(
    r"(https?://\S+|www\.\S+|\b[A-Za-z0-9.-]+\.[A-Za-z]{2,}\S*)",
    re.IGNORECASE,
)

# A cue word separated from a following URL by at least one whitespace
# character. The URL itself is only looked ahead at, not consumed.
#
# The separator must be `\s+`, not `\s*`: with a zero-width gap the pattern
# also fires when the cue word is the first label of a hostname
# ("login.example.com"), which strips "login" and leaves ".example.com"
# behind, so a stray "." survives the URL pass.
cue_before_url_pat = re.compile(
    rf"(\b{cue_words_en}\b|\b{cue_words_bn}\b)\s+(?={url_pat.pattern})",
    re.IGNORECASE,
)


def normalize_text(t: str) -> str:
    """Return *t* lower-cased with URLs and their leading cue words removed.

    Processing order:

    1. A cue word (English or Bengali) followed by whitespace and a URL is
       replaced by a single space.
    2. Every URL-like token is deleted.
    3. The result is lower-cased and stripped of leading/trailing whitespace.

    Interior whitespace is not collapsed, so removing a link may leave several
    consecutive spaces behind.

    Args:
        t: Text to normalize. Non-string values are coerced with ``str()``.

    Returns:
        The normalized text.
    """
    text = str(t)
    text = cue_before_url_pat.sub(" ", text)
    text = url_pat.sub("", text)
    return text.lower().strip()