squadgoals404 committed on
Commit
6ff1e0a
·
verified ·
1 Parent(s): bb859ed

Initial upload of MNB-Prohori (Bangla SMS smishing classifier)

Browse files
README.md ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ # MNB-Prohori (Bangla SMS smishing classifier)
2
+
3
+ - **Model**: Multinomial Naive Bayes on TF-IDF (word 1–2g, char 3–5g)
4
+ - **Labels**: ['normal', 'promo', 'smish']
5
+ - **Files**: `multinomial_nb_model.joblib`, `tfidf_word.joblib`, `tfidf_char.joblib`,
6
+ `preprocess.py`, `id2label.json`, `label2id.json`
7
+
8
+ ## Quick Inference (Python)
9
+ See the inference code in this repository's description.
id2label.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "0": "normal",
3
+ "1": "promo",
4
+ "2": "smish"
5
+ }
label2id.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "normal": 0,
3
+ "promo": 1,
4
+ "smish": 2
5
+ }
multinomial_nb_model.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:720f511b908ef6ddbef84d0fe8c20543b4ba05e71211d5042eec76319f30b6be
3
+ size 1733503
preprocess.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
"""Text normalisation for the MNB-Prohori Bangla SMS smishing classifier.

Replaces URL-like substrings, and call-to-action words that directly
precede them, with placeholder tokens, then lowercases and trims the text.
"""

import re

# English call-to-action words that may introduce a link.
cue_words_en = r"(check|click|visit|tap|verify|open|login|log\s*in|see|confirm|update|activate)"
# Bangla counterparts of the call-to-action words above.
cue_words_bn = r"(চেক|ক্লিক|ভিজিট|ট্যাপ|যাচাই|লগইন|লগ\s*ইন|দেখুন|আপডেট|অ্যাকটিভেট|নিশ্চিত)"

# URL-like text: an explicit http(s) scheme, a "www." prefix, or a bare
# dotted name ending in a label of two or more ASCII letters, plus any
# non-whitespace tail.
url_pat = re.compile(
    r"(https?://\S+|www\.\S+|\b[A-Za-z0-9.-]+\.[A-Za-z]{2,}\S*)",
    re.IGNORECASE,
)

# A cue word (either language) plus any following whitespace, matched only
# when URL-like text comes right after it. The lookahead does not consume
# the URL, so `url_pat` can still replace it afterwards.
cue_before_url_pat = re.compile(
    rf"(\b{cue_words_en}\b|\b{cue_words_bn}\b)\s*(?={url_pat.pattern})",
    re.IGNORECASE,
)


def normalize_text(t: str) -> str:
    """Return *t* with link cues and URLs replaced by placeholder tokens.

    The input is coerced with ``str()``, so non-string values are accepted.
    Cue words immediately before a URL become ``"<LINK_CUE> "`` and each
    URL-like substring becomes ``"<URL>"``. The result is then lowercased
    and stripped, so the placeholders appear in the output as
    ``<link_cue>`` and ``<url>``.
    """
    raw = str(t)
    # Cues first: the cue pattern needs the original URL text still present
    # for its lookahead to succeed.
    cues_marked = cue_before_url_pat.sub("<LINK_CUE> ", raw)
    urls_marked = url_pat.sub("<URL>", cues_marked)
    return urls_marked.lower().strip()
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ scikit-learn
2
+ scipy
3
+ numpy
4
+ joblib
tfidf_char.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8411d5dbcd8531122704823430ddc8afa8041653dc74505a9cbf7f5f8822114
3
+ size 1133805
tfidf_word.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:109c980c36cb5bb1c55f8d4ea1a9fd859fa850b1554566fd9b806e7e859098c9
3
+ size 242572