File size: 5,910 Bytes
c25dcd7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import pandas as pd
import numpy as np
from datasets import load_dataset
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import m2cgen as m2c
import re

print("Loading dataset alanjoshua2005/india-spam-sms...")
dataset = load_dataset("alanjoshua2005/india-spam-sms")

# Materialise the 'train' split as a DataFrame and show which columns it has.
df = pd.DataFrame(dataset['train'])
print(df.columns)

# Normalise the schema to a 'text' column (message body) and a 'label' column
# (1 = spam, 0 = not spam). Known layouts are handled explicitly; anything
# else falls back to a keyword search over the column names.
if 'Message' in df.columns and 'Category' in df.columns:
    df['text'] = df['Message']
    # Compare case-insensitively so 'Spam' / 'SPAM ' are not silently mapped to 0.
    df['label'] = df['Category'].apply(lambda x: 1 if str(x).strip().lower() == 'spam' else 0)
elif 'text' in df.columns and 'label' in df.columns:
    pass  # already in the expected layout; the label dtype is checked below
elif 'v1' in df.columns and 'v2' in df.columns:
    df['text'] = df['v2']
    df['label'] = df['v1'].apply(lambda x: 1 if str(x).strip().lower() == 'spam' else 0)
else:
    # Snapshot the column names first: the loop body adds columns to df.
    # When several columns match, the last one wins.
    for col in list(df.columns):
        col_name = str(col).lower()
        if 'text' in col_name or 'msg' in col_name or 'message' in col_name:
            df['text'] = df[col]
        if 'label' in col_name or 'spam' in col_name or 'category' in col_name:
            df['label'] = df[col]

# Fail fast with a clear message instead of a KeyError during feature extraction.
missing = [name for name in ('text', 'label') if name not in df.columns]
if missing:
    raise ValueError(
        f"Could not locate column(s) {missing} among dataset columns {list(df.columns)}"
    )

# Non-numeric labels (e.g. 'spam' / 'ham' strings) are mapped to 1 / 0.
# is_numeric_dtype is used rather than `dtype == object` so that pandas'
# dedicated string and categorical dtypes are converted as well.
if not pd.api.types.is_numeric_dtype(df['label']):
    df['label'] = df['label'].apply(lambda x: 1 if 'spam' in str(x).lower() else 0)

print(df.head())

# Phrases counted (by substring) towards urgencyScore.
_URGENT_WORDS = (
    'urgent', 'hurry', 'limited', 'immediate', 'action required',
    'alert', 'warning', 'expires', 'claim',
)

# Keywords matched by substring for containsMoneyAction. The rupee
# abbreviation 'rs' is handled separately by _RUPEE_ABBREV.
_MONEY_WORDS = (
    '₹', 'free', 'win', 'prize', 'cash', 'money', 'credit',
    'loan', 'offer', 'pay', 'collect', 'bank',
)

# 'rs' only counts when it is not embedded in a longer word: "rs 500",
# "rs.500" and "rs500" match, while "yours", "first" or "hours" do not.
_RUPEE_ABBREV = re.compile(r"(?<![a-z])rs(?![a-z])")

# A URL-like token: starts with http://, https:// or www. and runs to whitespace.
_URL_PATTERN = re.compile(r"(?:http[s]?://|www\.)[^\s]+")


def extract_features(row):
    """Build the 13-element feature vector for one message.

    Args:
        row: Mapping-like object (dict or DataFrame row) with a 'text' entry
            (the message body) and a 'label' entry (1 means spam).

    Returns:
        A list of 13 numbers in this order: senderAgeScore, senderInContacts,
        senderMessageCount, senderCarrierScore, urgencyScore, domainCount,
        domainLevenshtein, containsMoneyAction, capsRatio, hourOfDay,
        isWeekend, isFestivalPeriod, recentScamSenderSim.

    Only urgencyScore, domainCount, containsMoneyAction and capsRatio are
    computed from the text. The remaining features are simulated with
    np.random, and most of them are drawn from ranges chosen by the label.
    NOTE(review): because those simulated values are derived from the label,
    evaluation metrics on this data are likely optimistic.
    """
    orig_text = str(row['text'])
    text = orig_text.lower()
    is_spam = row['label'] == 1

    # 1-4. Sender context is not available in the dataset, so it is simulated.
    senderAgeScore = np.random.uniform(0.0, 0.3) if is_spam else np.random.uniform(0.5, 1.0)
    senderInContacts = 0.0 if is_spam else float(np.random.rand() > 0.2)
    senderMessageCount = np.random.randint(0, 3) if is_spam else np.random.randint(10, 100)
    senderCarrierScore = np.random.uniform(0.0, 0.4) if is_spam else np.random.uniform(0.6, 1.0)

    # 5. urgencyScore: fraction of the urgency phrases present (0.0 to 1.0).
    urgencyScore = sum(1 for phrase in _URGENT_WORDS if phrase in text) / len(_URGENT_WORDS)

    # 6. domainCount: number of URL-like tokens in the message.
    domainCount = len(_URL_PATTERN.findall(text))

    # 7. domainLevenshtein: simulated; no real domain comparison is performed.
    domainLevenshtein = np.random.uniform(0.2, 0.9) if (is_spam and domainCount > 0) else 1.0

    # 8. containsMoneyAction: any money keyword, or 'rs' as a standalone token.
    has_money_word = any(word in text for word in _MONEY_WORDS)
    containsMoneyAction = float(has_money_word or _RUPEE_ABBREV.search(text) is not None)

    # 9. capsRatio: upper-case letters over all letters, using the original casing.
    caps_count = sum(1 for c in orig_text if c.isupper())
    letters_count = sum(1 for c in orig_text if c.isalpha())
    capsRatio = caps_count / letters_count if letters_count > 0 else 0.0

    # 10. hourOfDay: simulated; late-night hours for spam, 8-19 otherwise.
    hourOfDay = float(np.random.choice([0, 1, 2, 3, 4, 22, 23])) if is_spam else float(np.random.randint(8, 20))

    # 11. isWeekend: simulated; more likely to be 1.0 for spam.
    isWeekend = float(np.random.rand() > (0.3 if is_spam else 0.8))

    # 12. isFestivalPeriod: simulated, independent of the label.
    isFestivalPeriod = float(np.random.rand() > 0.8)

    # 13. recentScamSenderSim: simulated; high for spam, low otherwise.
    recentScamSenderSim = np.random.uniform(0.5, 1.0) if is_spam else np.random.uniform(0.0, 0.2)

    return [
        senderAgeScore, senderInContacts, float(senderMessageCount), senderCarrierScore,
        urgencyScore, float(domainCount), domainLevenshtein, containsMoneyAction, capsRatio,
        hourOfDay, isWeekend, isFestivalPeriod, recentScamSenderSim,
    ]

print("Extracting features...")
# Seed NumPy's global RNG so the simulated features drawn inside
# extract_features() are reproducible, in line with the fixed random_state
# used for the split and the forest below.
np.random.seed(42)
X = np.array([extract_features(row) for _, row in df.iterrows()])
y = df['label'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training Random Forest...")
# A small, shallow forest keeps the generated Dart source compact.
clf = RandomForestClassifier(n_estimators=15, max_depth=5, random_state=42)
clf.fit(X_train, y_train)

print("Evaluating...")
preds = clf.predict(X_test)
print(classification_report(y_test, preds))

print("Exporting model to Dart with m2cgen...")
dart_code = m2c.export_to_dart(clf)

# Write to our model file
file_path = "lib/kavacha/models/random_forest_model.dart"
# The header opens class RandomForestModel and defines score(), which builds
# the input list in the same order as extract_features() and delegates to
# score_features(). The generated code is appended inside the class body.
header = """// Auto-generated using m2cgen
import 'sms_feature_vector.dart';

class RandomForestModel {
  double score(SmsFeatureVector features) {
    List<double> input = [
      features.senderAgeScore,
      features.senderInContacts ? 1.0 : 0.0,
      features.senderMessageCount.toDouble(),
      features.senderCarrierScore,
      features.urgencyScore,
      features.domainCount.toDouble(),
      features.domainLevenshtein,
      features.containsMoneyAction ? 1.0 : 0.0,
      features.capsRatio,
      features.hourOfDay.toDouble(),
      features.isWeekend ? 1.0 : 0.0,
      features.isFestivalPeriod ? 1.0 : 0.0,
      features.recentScamSenderSim
    ];
    List<double> output = score_features(input);
    // output[1] is the probability of class 1 (spam)
    return output[1];
  }

"""

# The header calls score_features(), and the class already has a method named
# score(), so the generated entry point must be renamed before it is embedded.
# The rename targets the Dart signature `List<double> score(`; the earlier
# chained str.replace looked for a Java-style signature and never matched.
# The "double[]" replacement is kept only as a defensive no-op for Dart output.
code, renamed = re.subn(
    r"\bList<double>\s+score\s*\(",
    "List<double> score_features(",
    dart_code.replace("double[]", "List<double>"),
    count=1,
)
if renamed != 1:
    # Writing the file anyway would produce Dart that cannot compile.
    raise RuntimeError(
        "Could not find the generated 'List<double> score(' entry point in the m2cgen output"
    )

# Explicit UTF-8 so the output does not depend on the platform default encoding.
with open(file_path, "w", encoding="utf-8") as f:
    f.write(header)
    f.write(code)
    # Close the RandomForestModel class opened in the header.
    f.write("\n}\n")

print("Done writing to " + file_path)