File size: 748 Bytes
import fasttext
import pandas as pd
# Build a fastText training file from the labelled SMS dataset, train a
# supervised classifier on it, and save the trained model.

# Try different encodings if UTF-8 does not work
try:
    data = pd.read_csv('sms_spam_phishing_dataset.csv', encoding='utf-8')
except UnicodeDecodeError:
    # Try latin1 encoding
    data = pd.read_csv('sms_spam_phishing_dataset.csv', encoding='ISO-8859-1')

# Preprocess data: format as fastText expects (each line: "__label__<label> <text>")
# Rows without a label or a message are left out of the training lines;
# formatting them would produce the literal text "nan".
usable = data.dropna(subset=['Label', 'Message'])
# A label must be a single whitespace-free token, otherwise everything after
# the first space would be read as message text.
labels = usable['Label'].astype(str).str.split().str.join('_')
# One example per line: line breaks inside a message would otherwise split it
# into several unlabeled examples.
messages = usable['Message'].astype(str).str.replace(r'[\r\n]+', ' ', regex=True)
ft_lines = '__label__' + labels + ' ' + messages
# Assignment aligns on the index, so rows that were left out get NaN here.
data['ft_format'] = ft_lines

# Save preprocessed data
# Written directly rather than through to_csv(): CSV quoting wraps any line
# containing a comma or a double quote in quotes, so the line would no longer
# start with the "__label__" prefix.
with open('ft_data.txt', 'w', encoding='utf-8', newline='\n') as ft_file:
    ft_file.writelines(f'{line}\n' for line in ft_lines)

# Train a supervised model
model = fasttext.train_supervised(input='ft_data.txt', epoch=25, lr=1.0, wordNgrams=2)

# Save the model
model.save_model('ots_sms_model_v1.1.bin')
|