import pandas as pd
import numpy as np
from datasets import load_dataset
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import m2cgen as m2c
import re
print("Loading dataset alanjoshua2005/india-spam-sms...")
dataset = load_dataset("alanjoshua2005/india-spam-sms")
# The 'train' split is a table of records; normalise it into a DataFrame with
# canonical 'text' and 'label' (1 = spam, 0 = ham) columns, whatever column
# names the dataset actually ships with.
df = pd.DataFrame(dataset['train'])
print(df.columns)
if 'Message' in df.columns and 'Category' in df.columns:
    df['text'] = df['Message']
    df['label'] = df['Category'].apply(lambda x: 1 if x == 'spam' else 0)
elif 'text' in df.columns and 'label' in df.columns:
    pass
elif 'v1' in df.columns and 'v2' in df.columns:
    # Classic UCI SMS-spam layout: v1 = category, v2 = message body.
    df['text'] = df['v2']
    df['label'] = df['v1'].apply(lambda x: 1 if x == 'spam' else 0)
else:
    # Last resort: fuzzy column-name matching.
    for col in df.columns:
        low = col.lower()
        if 'text' in low or 'msg' in low or 'message' in low:
            df['text'] = df[col]
        if 'label' in low or 'spam' in low or 'category' in low:
            df['label'] = df[col]
# Fail fast with a clear message instead of an opaque KeyError later on
# (the original would crash at df['label'].dtype if no column matched).
if 'text' not in df.columns or 'label' not in df.columns:
    raise ValueError(
        "Could not locate text/label columns in dataset; found: "
        + str(list(df.columns))
    )
if df['label'].dtype == object:
    # String labels ('spam' / 'ham') -> binary 0/1.
    df['label'] = df['label'].apply(lambda x: 1 if 'spam' in str(x).lower() else 0)
print(df.head())
# Keyword patterns compiled once at import time, with \b word boundaries so
# that e.g. 'rs' does not fire inside 'hours'/'hers' and 'win' does not fire
# inside 'winter' (the original used raw substring containment).
_URGENT_WORDS = ['urgent', 'hurry', 'limited', 'immediate', 'action required',
                 'alert', 'warning', 'expires', 'claim']
_URGENT_PATTERNS = [re.compile(r"\b" + re.escape(w) + r"\b") for w in _URGENT_WORDS]
_MONEY_WORDS = ['rs', 'free', 'win', 'prize', 'cash', 'money', 'credit',
                'loan', 'offer', 'pay', 'collect', 'bank']
_MONEY_PATTERNS = [re.compile(r"\b" + re.escape(w) + r"\b") for w in _MONEY_WORDS]
_DOMAIN_PATTERN = re.compile(r"(http[s]?://|www\.)[^\s]+")


def extract_features(row):
    """Build the 13-element numeric feature vector for one SMS record.

    row: mapping with 'text' (message body) and 'label' (1 = spam, 0 = ham).

    NOTE(review): features 1-4, 7 and 10-13 are *simulated from the label*
    (deliberate mock context per the original comments). This leaks the
    target into the inputs, so the held-out metrics below are optimistic.
    """
    text = str(row['text']).lower()
    is_spam = row['label'] == 1
    # --- simulated sender-context features (label-dependent mocks) ---
    # 1. senderAgeScore  2. senderInContacts
    # 3. senderMessageCount  4. senderCarrierScore
    senderAgeScore = np.random.uniform(0.0, 0.3) if is_spam else np.random.uniform(0.5, 1.0)
    senderInContacts = 0.0 if is_spam else float(np.random.rand() > 0.2)
    senderMessageCount = np.random.randint(0, 3) if is_spam else np.random.randint(10, 100)
    senderCarrierScore = np.random.uniform(0.0, 0.4) if is_spam else np.random.uniform(0.6, 1.0)
    # --- text-derived features ---
    # 5. urgencyScore: fraction of urgency phrases present (whole-word match)
    urgencyScore = sum(1 for p in _URGENT_PATTERNS if p.search(text)) / len(_URGENT_PATTERNS)
    # 6. domainCount: number of URL-looking tokens in the message
    domainCount = len(_DOMAIN_PATTERN.findall(text))
    # 7. domainLevenshtein (simulated; 1.0 = benign when no URL or not spam)
    domainLevenshtein = np.random.uniform(0.2, 0.9) if (is_spam and domainCount > 0) else 1.0
    # 8. containsMoneyAction: whole-word money terms; the rupee symbol is
    # checked as a plain substring because \b does not match next to a
    # non-word character like '₹'.
    containsMoneyAction = float('₹' in text or any(p.search(text) for p in _MONEY_PATTERNS))
    # 9. capsRatio: uppercase letters / total letters in the ORIGINAL casing
    orig_text = str(row['text'])
    letters = [c for c in orig_text if c.isalpha()]
    capsRatio = sum(c.isupper() for c in letters) / len(letters) if letters else 0.0
    # --- more simulated context (label-dependent mocks) ---
    # 10. hourOfDay (spam skewed to odd hours)  11. isWeekend
    # 12. isFestivalPeriod  13. recentScamSenderSim
    hourOfDay = float(np.random.choice([0, 1, 2, 3, 4, 22, 23])) if is_spam else float(np.random.randint(8, 20))
    isWeekend = float(np.random.rand() > (0.3 if is_spam else 0.8))
    isFestivalPeriod = float(np.random.rand() > 0.8)
    recentScamSenderSim = np.random.uniform(0.5, 1.0) if is_spam else np.random.uniform(0.0, 0.2)
    return [
        senderAgeScore, senderInContacts, float(senderMessageCount), senderCarrierScore,
        urgencyScore, float(domainCount), domainLevenshtein, containsMoneyAction, capsRatio,
        hourOfDay, isWeekend, isFestivalPeriod, recentScamSenderSim,
    ]
print("Extracting features...")
# Build the design matrix row-by-row from the DataFrame records.
feature_rows = [extract_features(record) for _, record in df.iterrows()]
X = np.array(feature_rows)
y = df['label'].values

# Hold out 20% for evaluation; fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training Random Forest...")
clf = RandomForestClassifier(n_estimators=15, max_depth=5, random_state=42)
clf.fit(X_train, y_train)

print("Evaluating...")
preds = clf.predict(X_test)
print(classification_report(y_test, preds))
print("Exporting model to Dart with m2cgen...")
dart_code = m2c.export_to_dart(clf)

# Hand-written wrapper class adapting SmsFeatureVector to the generated
# scorer. NOTE(review): assumes lib/kavacha/models/ already exists.
file_path = "lib/kavacha/models/random_forest_model.dart"
header = """// Auto-generated using m2cgen
import 'sms_feature_vector.dart';
class RandomForestModel {
double score(SmsFeatureVector features) {
List<double> input = [
features.senderAgeScore,
features.senderInContacts ? 1.0 : 0.0,
features.senderMessageCount.toDouble(),
features.senderCarrierScore,
features.urgencyScore,
features.domainCount.toDouble(),
features.domainLevenshtein,
features.containsMoneyAction ? 1.0 : 0.0,
features.capsRatio,
features.hourOfDay.toDouble(),
features.isWeekend ? 1.0 : 0.0,
features.isFestivalPeriod ? 1.0 : 0.0,
features.recentScamSenderSim
];
List<double> output = score_features(input);
// output[1] is the probability of class 1 (spam)
return output[1];
}
"""
# m2cgen's Dart backend emits `List<double> score(List<double> input)`.
# That name collides with the wrapper method `score` above, and the wrapper
# calls `score_features`, so the generated function must be renamed.
# BUG FIX: the original only replaced Java-style `public double[] score`,
# which never appears in Dart output, leaving `score_features` undefined.
code = dart_code.replace("double[]", "List<double>")
code = code.replace("public double[] score", "List<double> score_features")
code = re.sub(r"\bList<double>\s+score\s*\(", "List<double> score_features(", code)

with open(file_path, "w") as f:
    f.write(header)
    f.write(code)
    # Close the RandomForestModel class opened in the header.
    f.write("\n}\n")
print("Done writing to " + file_path)