shdnalssheddi committed on
Commit
3dda458
·
verified ·
1 Parent(s): 8fbb437

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -25
app.py CHANGED
@@ -2,9 +2,7 @@ from huggingface_hub import InferenceClient
2
 
3
  # -*- coding: utf-8 -*-
4
  """Mirsad-model-only.ipynb
5
-
6
  Automatically generated by Colab.
7
-
8
  Original file is located at
9
  https://colab.research.google.com/drive/12QnA8fnwQNDyKtRg0CjLXX84umecSsvE
10
  """
@@ -40,7 +38,6 @@ data = pd.read_csv(file_path,encoding='latin-1')
40
  print(data.head())
41
 
42
  """dropping columns and renaming:
43
-
44
  """
45
 
46
  # Dropping the redundant-looking columns (for this project)
@@ -88,7 +85,7 @@ def contains_spam_words(text):
88
  return 0
89
 
90
  # Adding the column 'Word_Of_Mouth'
91
- data['Word_Of_Mouth'] = data['Text'].apply(contains_spam_words)
92
 
93
  # Defining a function to clean up the text
94
  def Clean(Text):
@@ -148,7 +145,7 @@ X_tfidf = tfidf.fit_transform(corpus).toarray()
148
 
149
 
150
  # Combining the TF-IDF matrix with the new feature columns
151
- X_additional_features = np.column_stack((X_tfidf, data[['Phone', 'URL', 'Email', 'Word_Of_Mouth']].values))
152
  #Let's have a look at our feature
153
  X_tfidf.dtype
154
 
@@ -162,18 +159,26 @@ data["Target"] = label_encoder.fit_transform(data["Target"])
162
  y = data['Target']
163
 
164
  # Splitting the dataset
165
- X_train, X_test, y_train, y_test = train_test_split(X_additional_features, y, test_size=0.2, random_state=42)
166
 
167
- from sklearn.naive_bayes import MultinomialNB
 
 
 
 
 
 
 
 
168
  from sklearn.metrics import accuracy_score, classification_report
169
 
170
  # Train the Naive Bayes model
171
- nb_model = MultinomialNB()
172
- nb_model.fit(X_train, y_train)
173
 
174
  # Test the model
175
- y_pred_nb = nb_model.predict(X_test)
176
- accuracy_nb = accuracy_score(y_test, y_pred_nb)
177
 
178
  # Function to classify a message and provide justification
179
  def classify_message(message):
@@ -196,28 +201,28 @@ def classify_message(message):
196
  spam_word_feature = contains_spam_words(message)
197
 
198
  # Combine all features
199
- message_features = np.column_stack((message_tfidf, [[phone_feature, url_feature, email_feature, spam_word_feature]]))
200
 
201
  # Predict using the trained model
202
- prediction = nb_model.predict(message_features)
203
- probability = nb_model.predict_proba(message_features)[0][1] # Probability of being spam
204
 
205
  # Provide justification
206
  justifications = []
207
- if phone_feature:
208
- justifications.append("Contains a phone number.")
209
- if url_feature:
210
- justifications.append("Contains a URL.")
211
- if email_feature:
212
- justifications.append("Contains an email address.")
213
- if spam_word_feature:
214
- justifications.append("Contains spam-related keywords.")
215
  if not justifications:
216
- justifications.append("No specific indicators of spam detected.")
217
 
218
  # Return result
219
  label = "Spam" if prediction[0] == 1 else "Not Spam"
220
- justification = " | ".join(justifications)
221
  return {"Label": label, "Justification": justification, "Spam Probability": f"{probability * 100:.2f}%"}
222
 
223
 
@@ -257,4 +262,3 @@ interface = gr.Interface(
257
 
258
  # Launch the app
259
  interface.launch()
260
-
 
2
 
3
  # -*- coding: utf-8 -*-
4
  """Mirsad-model-only.ipynb
 
5
  Automatically generated by Colab.
 
6
  Original file is located at
7
  https://colab.research.google.com/drive/12QnA8fnwQNDyKtRg0CjLXX84umecSsvE
8
  """
 
38
  print(data.head())
39
 
40
  """dropping columns and renaming:
 
41
  """
42
 
43
  # Dropping the redundant-looking columns (for this project)
 
85
  return 0
86
 
87
  # Adding the column 'Word_Of_Mouth'
88
+ # data['Word_Of_Mouth'] = data['Text'].apply(contains_spam_words)
89
 
90
  # Defining a function to clean up the text
91
  def Clean(Text):
 
145
 
146
 
147
  # Combining the TF-IDF matrix with the new feature columns
148
+ X_additional_features = np.column_stack((X_tfidf, data[['Phone', 'URL', 'Email']].values))
149
  #Let's have a look at our feature
150
  X_tfidf.dtype
151
 
 
159
  y = data['Target']
160
 
161
  # Splitting the dataset
162
+ X_train, X_test, y_train, y_test = train_test_split(X_additional_features, y, test_size=0.3, random_state=42)
163
 
164
+ from imblearn.over_sampling import SMOTE
165
+
166
+ # Initialize SMOTE
167
+ smote = SMOTE(random_state=42)
168
+
169
+ # Fit and resample the training data
170
+ X_train, y_train = smote.fit_resample(X_train, y_train)
171
+
172
+ from sklearn.svm import SVC
173
  from sklearn.metrics import accuracy_score, classification_report
174
 
175
  # Train the SVC (support vector classifier) model
176
+ svc_model = SVC(random_state=42, probability=True)
177
+ svc_model.fit(X_train, y_train)
178
 
179
  # Test the model
180
+ y_pred_svc = svc_model.predict(X_test)
181
+ accuracy_svc = accuracy_score(y_test, y_pred_svc)
182
 
183
  # Function to classify a message and provide justification
184
  def classify_message(message):
 
201
  spam_word_feature = contains_spam_words(message)
202
 
203
  # Combine all features
204
+ message_features = np.column_stack((message_tfidf, [[phone_feature, url_feature, email_feature]]))
205
 
206
  # Predict using the trained model
207
+ prediction = svc_model.predict(message_features)
208
+ probability = svc_model.predict_proba(message_features)[0][1] # Probability of being spam
209
 
210
  # Provide justification
211
  justifications = []
212
+ if phone_feature and prediction[0] == 1:
213
+ justifications.append("a phone number, which is often used in spam messages")
214
+ if url_feature and prediction[0] == 1:
215
+ justifications.append("a link, a common element in spam content")
216
+ if email_feature and prediction[0] == 1:
217
+ justifications.append("an email address, which may indicate promotional or smishing intent")
218
+ if spam_word_feature and prediction[0] == 1:
219
+ justifications.append("language commonly found in spam messages")
220
  if not justifications:
221
+ justifications.append("no clear signs of spam were found in the message")
222
 
223
  # Return result
224
  label = "Spam" if prediction[0] == 1 else "Not Spam"
225
+ justification = "The reason for this classification is that the message includes " + ", and ".join(justifications)
226
  return {"Label": label, "Justification": justification, "Spam Probability": f"{probability * 100:.2f}%"}
227
 
228
 
 
262
 
263
  # Launch the app
264
  interface.launch()