squadgoals404 committed on
Commit
42e9ab0
·
verified ·
1 Parent(s): 5616273

Initial upload of Ridge-Prohori (Bangla SMS smishing classifier)

Browse files
README.md ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # Ridge-Prohori (Bangla Smishing Classifier)
2
+
3
+ **RidgeClassifier (L2-regularized linear model)** trained on TF-IDF features: **word n-grams (1–2)** + **char_wb n-grams (3–5)**.
4
+ Preprocessing: a cue word directly before a URL → `<LINK_CUE>`, URLs → `<URL>`, then the whole text is lowercased (so the placeholders end up as `<link_cue>` / `<url>`).
id2label.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "0": "normal",
3
+ "1": "promo",
4
+ "2": "smish"
5
+ }
label2id.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "normal": 0,
3
+ "promo": 1,
4
+ "smish": 2
5
+ }
preprocess.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import re
3
+ cue_words_en = r"(check|click|visit|tap|verify|open|login|log\s*in|see|confirm|update|activate)"
4
+ cue_words_bn = r"(চেক|ক্লিক|ভিজিট|ট্যাপ|যাচাই|লগইন|লগ\s*ইন|দেখুন|আপডেট|অ্যাকটিভেট|নিশ্চিত)"
5
+ url_pat = re.compile(r"(https?://\S+|www\.\S+|\b[A-Za-z0-9.-]+\.[A-Za-z]{2,}\S*)", re.IGNORECASE)
6
+ cue_before_url_pat = re.compile(rf"(\b{cue_words_en}\b|\b{cue_words_bn}\b)\s*(?={url_pat.pattern})", re.IGNORECASE)
7
+
8
+ def normalize_text(t: str) -> str:
9
+ s = re.sub(cue_before_url_pat, "<LINK_CUE> ", str(t))
10
+ s = re.sub(url_pat, "<URL>", s)
11
+ return s.lower().strip()
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ scikit-learn>=1.2
2
+ scipy
3
+ numpy
4
+ joblib
ridge_model.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff91406f0141931d3b0910dab95e530584b5261ff3cc54202b606084a60fe66f
3
+ size 1135656
tfidf_char.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7aca3bc916fcd9cc4b93cb85f7900bc59434805664a4f9afff89b71ec22cf375
3
+ size 902539
tfidf_word.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16b0abae11775f89f228755d7cc1772b920a099950fbb6cee225aad68b890948
3
+ size 144671