Spaces:

shdnalssheddi
/

Mirsad-classifier

Sleeping

App Files Files Community

shdnalssheddi committed on Apr 21, 2025

Commit

3dda458

verified ·

1 Parent(s): 8fbb437

Update app.py

Browse files

Files changed (1) hide show

app.py +29 -25

app.py CHANGED Viewed

@@ -2,9 +2,7 @@ from huggingface_hub import InferenceClient
 # -*- coding: utf-8 -*-
 """Mirsad-model-only.ipynb
 Automatically generated by Colab.
 Original file is located at
     https://colab.research.google.com/drive/12QnA8fnwQNDyKtRg0CjLXX84umecSsvE
 """
@@ -40,7 +38,6 @@ data = pd.read_csv(file_path,encoding='latin-1')
 print(data.head())
 """dropping columns and renaming:
 """
 # Dropping the redundant-looking columns (for this project)
@@ -88,7 +85,7 @@ def contains_spam_words(text):
     return 0
 # Adding the column 'Word_Of_Mouth'
-data['Word_Of_Mouth'] = data['Text'].apply(contains_spam_words)
 # Defining a function to clean up the text
 def Clean(Text):
@@ -148,7 +145,7 @@ X_tfidf = tfidf.fit_transform(corpus).toarray()
 # Combining the TF-IDF matrix with the new feature columns
-X_additional_features = np.column_stack((X_tfidf, data[['Phone', 'URL', 'Email', 'Word_Of_Mouth']].values))
 #Let's have a look at our feature
 X_tfidf.dtype
@@ -162,18 +159,26 @@ data["Target"] = label_encoder.fit_transform(data["Target"])
 y = data['Target']
 # Splitting the dataset
-X_train, X_test, y_train, y_test = train_test_split(X_additional_features, y, test_size=0.2, random_state=42)
-from sklearn.naive_bayes import MultinomialNB
 from sklearn.metrics import accuracy_score, classification_report
 # Train the Naive Bayes model
-nb_model = MultinomialNB()
-nb_model.fit(X_train, y_train)
 # Test the model
-y_pred_nb = nb_model.predict(X_test)
-accuracy_nb = accuracy_score(y_test, y_pred_nb)
 # Function to classify a message and provide justification
 def classify_message(message):
@@ -196,28 +201,28 @@ def classify_message(message):
     spam_word_feature = contains_spam_words(message)
     # Combine all features
-    message_features = np.column_stack((message_tfidf, [[phone_feature, url_feature, email_feature, spam_word_feature]]))
     # Predict using the trained model
-    prediction = nb_model.predict(message_features)
-    probability = nb_model.predict_proba(message_features)[0][1]  # Probability of being spam
     # Provide justification
     justifications = []
-    if phone_feature:
-        justifications.append("Contains a phone number.")
-    if url_feature:
-        justifications.append("Contains a URL.")
-    if email_feature:
-        justifications.append("Contains an email address.")
-    if spam_word_feature:
-        justifications.append("Contains spam-related keywords.")
     if not justifications:
-        justifications.append("No specific indicators of spam detected.")
     # Return result
     label = "Spam" if prediction[0] == 1 else "Not Spam"
-    justification = " | ".join(justifications)
     return {"Label": label, "Justification": justification, "Spam Probability": f"{probability * 100:.2f}%"}
@@ -257,4 +262,3 @@ interface = gr.Interface(
 # Launch the app
 interface.launch()

 # -*- coding: utf-8 -*-
 """Mirsad-model-only.ipynb
 Automatically generated by Colab.
 Original file is located at
     https://colab.research.google.com/drive/12QnA8fnwQNDyKtRg0CjLXX84umecSsvE
 """
 print(data.head())
 """dropping columns and renaming:
 """
 # Dropping the redundant-looking columns (for this project)
     return 0
 # Adding the column 'Word_Of_Mouth'
+# data['Word_Of_Mouth'] = data['Text'].apply(contains_spam_words)
 # Defining a function to clean up the text
 def Clean(Text):
 # Combining the TF-IDF matrix with the new feature columns
+X_additional_features = np.column_stack((X_tfidf, data[['Phone', 'URL', 'Email']].values))
 #Let's have a look at our feature
 X_tfidf.dtype
 y = data['Target']
 # Splitting the dataset
+X_train, X_test, y_train, y_test = train_test_split(X_additional_features, y, test_size=0.3, random_state=42)
+from imblearn.over_sampling import SMOTE
+# Initialize SMOTE
+smote = SMOTE(random_state=42)
+# Fit and resample the training data
+X_train, y_train = smote.fit_resample(X_train, y_train)
+from sklearn.svm import SVC
 from sklearn.metrics import accuracy_score, classification_report
 # Train the SVC model (this commit replaces the earlier Naive Bayes classifier)
+svc_model = SVC(random_state=42, probability=True)
+svc_model.fit(X_train, y_train)
 # Test the model
+y_pred_svc = svc_model.predict(X_test)
+accuracy_svc = accuracy_score(y_test, y_pred_svc)
 # Function to classify a message and provide justification
 def classify_message(message):
     spam_word_feature = contains_spam_words(message)
     # Combine all features
+    message_features = np.column_stack((message_tfidf, [[phone_feature, url_feature, email_feature]]))
     # Predict using the trained model
+    prediction = svc_model.predict(message_features)
+    probability = svc_model.predict_proba(message_features)[0][1]  # Probability of being spam
     # Provide justification
     justifications = []
+    if phone_feature and prediction[0] == 1:
+        justifications.append("a phone number, which is often used in spam messages")
+    if url_feature and prediction[0] == 1:
+        justifications.append("a link, a common element in spam content")
+    if email_feature and prediction[0] == 1:
+        justifications.append("an email address, which may indicate promotional or smishing intent")
+    if spam_word_feature and prediction[0] == 1:
+        justifications.append("language commonly found in spam messages")
     if not justifications:
+        justifications.append("no clear signs of spam were found in the message")
     # Return result
     label = "Spam" if prediction[0] == 1 else "Not Spam"
+    justification = "The reason for this classification is that the message includes " + ", and ".join(justifications)
     return {"Label": label, "Justification": justification, "Spam Probability": f"{probability * 100:.2f}%"}
 # Launch the app
 interface.launch()