Initial upload of MNB-Prohori (Bangla SMS smishing classifier)
Browse files- README.md +9 -0
- id2label.json +5 -0
- label2id.json +5 -0
- multinomial_nb_model.joblib +3 -0
- preprocess.py +11 -0
- requirements.txt +4 -0
- tfidf_char.joblib +3 -0
- tfidf_word.joblib +3 -0
README.md
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# MNB-Prohori (Bangla SMS smishing classifier)
|
| 2 |
+
|
| 3 |
+
- **Model**: Multinomial Naive Bayes on TF-IDF features (word 1–2-grams, character 3–5-grams)
|
| 4 |
+
- **Labels**: ['normal', 'promo', 'smish']
|
| 5 |
+
- **Files**: `multinomial_nb_model.joblib`, `tfidf_word.joblib`, `tfidf_char.joblib`,
|
| 6 |
+
`preprocess.py`, `id2label.json`, `label2id.json`
|
| 7 |
+
|
| 8 |
+
## Quick Inference (Python)
|
| 9 |
+
See the code in this repository's description.
|
id2label.json
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"0": "normal",
|
| 3 |
+
"1": "promo",
|
| 4 |
+
"2": "smish"
|
| 5 |
+
}
|
label2id.json
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"normal": 0,
|
| 3 |
+
"promo": 1,
|
| 4 |
+
"smish": 2
|
| 5 |
+
}
|
multinomial_nb_model.joblib
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:720f511b908ef6ddbef84d0fe8c20543b4ba05e71211d5042eec76319f30b6be
|
| 3 |
+
size 1733503
|
preprocess.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import re
|
| 3 |
+
# Call-to-action verbs (English / Bangla) that are tagged when they directly
# precede a link.
cue_words_en = r"(check|click|visit|tap|verify|open|login|log\s*in|see|confirm|update|activate)"
cue_words_bn = r"(চেক|ক্লিক|ভিজিট|ট্যাপ|যাচাই|লগইন|লগ\s*ইন|দেখুন|আপডেট|অ্যাকটিভেট|নিশ্চিত)"

# Anything that looks like a link: an explicit http(s) scheme, a "www." prefix,
# or a bare "name.tld" token followed by any non-space tail.
url_pat = re.compile(r"(https?://\S+|www\.\S+|\b[A-Za-z0-9.-]+\.[A-Za-z]{2,}\S*)", re.IGNORECASE)

# A cue word separated from a following URL by at least one whitespace
# character. The URL itself is only looked ahead at, so it is left in place for
# the later URL substitution.
#
# Whitespace is mandatory (`\s+`): with `\s*` the only possible zero-gap match
# is a cue word that is itself the first label of a hostname
# ("login.bank.com", "verify-account.com"), which split one URL into a bogus
# cue token plus a mangled URL remainder.
# NOTE(review): if the shipped vectorizers/model were fitted on text produced
# with the old `\s*` pattern, validate (or re-fit) before relying on this.
cue_before_url_pat = re.compile(rf"(\b{cue_words_en}\b|\b{cue_words_bn}\b)\s+(?={url_pat.pattern})", re.IGNORECASE)


def normalize_text(t: str) -> str:
    """Return *t* with link cues and URLs replaced by placeholder tokens.

    The input is coerced with ``str()``. A cue word that is followed by
    whitespace and then a URL is replaced by ``<LINK_CUE> ``; every URL is then
    replaced by ``<URL>``. The result is lower-cased and stripped, so the
    placeholders come out as ``<link_cue>`` and ``<url>``.

    >>> normalize_text("Click https://example.com now")
    '<link_cue> <url> now'
    >>> normalize_text("https://login.bank.com/verify")
    '<url>'
    """
    # Cue tagging must run first: it relies on the raw URL still being present.
    s = cue_before_url_pat.sub("<LINK_CUE> ", str(t))
    s = url_pat.sub("<URL>", s)
    return s.lower().strip()
|
requirements.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
scikit-learn
|
| 2 |
+
scipy
|
| 3 |
+
numpy
|
| 4 |
+
joblib
|
tfidf_char.joblib
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b8411d5dbcd8531122704823430ddc8afa8041653dc74505a9cbf7f5f8822114
|
| 3 |
+
size 1133805
|
tfidf_word.joblib
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:109c980c36cb5bb1c55f8d4ea1a9fd859fa850b1554566fd9b806e7e859098c9
|
| 3 |
+
size 242572
|