| import re | |
# English call-to-action words that may introduce a link. Kept as a raw
# alternation group so it can be embedded in larger patterns below.
cue_words_en = r"(check|click|visit|tap|verify|open|login|log\s*in|see|confirm|update|activate)"

# Bengali-script cue words, used the same way as the English list.
cue_words_bn = r"(চেক|ক্লিক|ভিজিট|ট্যাপ|যাচাই|লগইন|লগ\s*ইন|দেখুন|আপডেট|অ্যাকটিভেট|নিশ্চিত)"

# URL-like tokens: explicit http(s) scheme, a leading "www.", or a bare
# "name.tld"-shaped token (letters/digits/dots/hyphens, a dot, 2+ letters).
url_pat = re.compile(
    r"(https?://\S+|www\.\S+|\b[A-Za-z0-9.-]+\.[A-Za-z]{2,}\S*)",
    re.IGNORECASE,
)

# A cue word followed by whitespace and then a URL. The URL itself is only
# looked ahead at, so it is left in place for `url_pat` to handle afterwards.
#
# At least one whitespace character is required between cue and URL. With
# zero whitespace allowed, the only way the trailing `\b` can hold is when
# the cue is directly followed by "." or "-", i.e. the cue word is the first
# label of a hostname ("login.example.com", "verify-account.com"). Matching
# there would cut the cue word out of the URL and leave a mangled remainder.
cue_before_url_pat = re.compile(
    rf"(\b{cue_words_en}\b|\b{cue_words_bn}\b)\s+(?={url_pat.pattern})",
    re.IGNORECASE,
)
def normalize_text(t: str) -> str:
    """Replace link cues and URLs in *t* with placeholder tokens.

    The input is coerced with ``str()``. Text matched by
    ``cue_before_url_pat`` is replaced with ``"<LINK_CUE> "``, then every
    match of ``url_pat`` is replaced with ``"<URL>"``. The result is
    lower-cased and stripped of surrounding whitespace, so the placeholders
    appear in the returned string as ``<link_cue>`` and ``<url>``.
    """
    raw = str(t)
    # Cues are tagged first: the cue pattern looks ahead for a URL, which
    # would no longer be there once URLs have been replaced.
    cue_tagged = cue_before_url_pat.sub("<LINK_CUE> ", raw)
    url_tagged = url_pat.sub("<URL>", cue_tagged)
    lowered = url_tagged.lower()
    return lowered.strip()