Initial upload of Ridge-Prohori (Bangla SMS smishing classifier)
Browse files- README.md +4 -0
- id2label.json +5 -0
- label2id.json +5 -0
- preprocess.py +11 -0
- requirements.txt +4 -0
- ridge_model.joblib +3 -0
- tfidf_char.joblib +3 -0
- tfidf_word.joblib +3 -0
README.md
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Ridge-Prohori (Bangla Smishing Classifier)
|
| 2 |
+
|
| 3 |
+
A **RidgeClassifier (L2-regularized linear model)** on TF-IDF features: **word n-grams (1–2)** + **char_wb n-grams (3–5)**.
|
| 4 |
+
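Preprocessing (applied in this order): a cue word directly before a URL → `<LINK_CUE>`, every URL → `<URL>`, then the whole text is lowercased and stripped (so the placeholders end up as `<link_cue>` and `<url>`).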
|
id2label.json
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"0": "normal",
|
| 3 |
+
"1": "promo",
|
| 4 |
+
"2": "smish"
|
| 5 |
+
}
|
label2id.json
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"normal": 0,
|
| 3 |
+
"promo": 1,
|
| 4 |
+
"smish": 2
|
| 5 |
+
}
|
preprocess.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import re
|
| 3 |
+
# English call-to-action words that may precede a link.
cue_words_en = r"(check|click|visit|tap|verify|open|login|log\s*in|see|confirm|update|activate)"

# Bangla call-to-action words that may precede a link.
cue_words_bn = r"(চেক|ক্লিক|ভিজিট|ট্যাপ|যাচাই|লগইন|লগ\s*ইন|দেখুন|আপডেট|অ্যাকটিভেট|নিশ্চিত)"

# A URL is one of: an http(s):// link, a www. link, or a bare "name.tld" token
# followed by any non-whitespace tail.
url_pat = re.compile(
    r"(https?://\S+|www\.\S+|\b[A-Za-z0-9.-]+\.[A-Za-z]{2,}\S*)",
    re.IGNORECASE,
)

# A cue word (either language) plus its trailing whitespace, matched only when
# a URL starts right after it. The URL sits in a lookahead, so it is not
# consumed and can still be replaced by ``url_pat`` afterwards.
cue_before_url_pat = re.compile(
    rf"(\b{cue_words_en}\b|\b{cue_words_bn}\b)\s*(?={url_pat.pattern})",
    re.IGNORECASE,
)


def normalize_text(t: str) -> str:
    """Return *t* with link cues and URLs replaced by placeholders, lowercased.

    The input is coerced with ``str()`` first. Processing order:

    1. A cue word immediately followed by a URL (optional whitespace in
       between) is replaced by ``"<LINK_CUE> "``.
    2. Every URL is replaced by ``"<URL>"``.
    3. The result is lowercased and stripped of surrounding whitespace.

    Because lowercasing runs last, the placeholders appear in the returned
    string as ``<link_cue>`` and ``<url>``.
    """
    raw = str(t)
    cues_marked = cue_before_url_pat.sub("<LINK_CUE> ", raw)
    urls_marked = url_pat.sub("<URL>", cues_marked)
    return urls_marked.lower().strip()
|
requirements.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
scikit-learn>=1.2
|
| 2 |
+
scipy
|
| 3 |
+
numpy
|
| 4 |
+
joblib
|
ridge_model.joblib
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ff91406f0141931d3b0910dab95e530584b5261ff3cc54202b606084a60fe66f
|
| 3 |
+
size 1135656
|
tfidf_char.joblib
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7aca3bc916fcd9cc4b93cb85f7900bc59434805664a4f9afff89b71ec22cf375
|
| 3 |
+
size 902539
|
tfidf_word.joblib
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:16b0abae11775f89f228755d7cc1772b920a099950fbb6cee225aad68b890948
|
| 3 |
+
size 144671
|