# shdnalssheddi's picture
# Update app.py
# 3dda458 verified
from huggingface_hub import InferenceClient
# -*- coding: utf-8 -*-
"""Mirsad-model-only.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/12QnA8fnwQNDyKtRg0CjLXX84umecSsvE
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from matplotlib.colors import ListedColormap
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, classification_report, accuracy_score, f1_score
from sklearn.metrics import ConfusionMatrixDisplay # Import ConfusionMatrixDisplay instead
"""### load data:"""
# Load the CSV file into a DataFrame
file_path = 'spam.csv'
data = pd.read_csv(file_path,encoding='latin-1')
# Display the first few rows of the dataset
print(data.head())
"""dropping columns and renaming:
"""
# Dropping the redundent looking collumns (for this project)
to_drop = ["Unnamed: 2","Unnamed: 3","Unnamed: 4"]
data = data.drop(data[to_drop], axis=1)
# Renaming the columns
data.rename(columns = {"v1":"Target", "v2":"Text"}, inplace = True)
"""# Feature Engineering: Adding New Columns"""
import re
# Function to detect phone numbers (e.g., formats like (123) 456-7890 or +1 123 456 7890)
def contains_phone_number(text):
    """Return 1 if *text* contains a phone-number-like pattern, else 0."""
    found = re.search(r'\b(\+?\d{1,2}[-.\s]?)?(\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}\b', text)
    return int(found is not None)
# Adding the column 'Phone': 1 when the raw message contains a phone number
data['Phone'] = data['Text'].map(contains_phone_number)
# Function to detect URLs
def contains_url(text):
    """Return 1 if *text* contains an http(s) or www link, else 0."""
    found = re.search(r'(https?://\S+|www\.\S+)', text)
    return int(found is not None)
# Adding the column 'URL': 1 when the raw message contains a link
data['URL'] = data['Text'].map(contains_url)
# Function to detect email addresses
def contains_email(text):
    """Return 1 if *text* contains an email address, else 0."""
    found = re.search(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text)
    return int(found is not None)
# Adding the column 'Email': 1 when the raw message contains an email address
data['Email'] = data['Text'].map(contains_email)
# Words and phrases that commonly signal spam / promotional content.
# Matched as lowercase substrings; the duplicate "win" entry from the
# original list has been removed (no behavioral change — membership only).
spam_keywords = [
    'warning', 'urgent', 'prize', 'win', 'free', 'claim', 'congratulations',
    'offer', "guarantee", "low rates", "credit", "investment", "mortgage",
    "cash", "save big", "act now", "limited time", "hurry", "final notice",
    "immediate", "bonus", "exclusive deal", "special promotion",
    "offer ends soon", "click here", "claim now", "sign up", "subscribe",
    "apply now", "order now", "no risk", "100% free", "no strings attached",
    "instant results", "guaranteed", "winner", "luxury", "cheap", "discount",
    "bargain", "unlimited access",
]

# Function to detect if any spam keywords are in the message
def contains_spam_words(text):
    """Return 1 if the lowercased *text* contains any spam keyword, else 0."""
    lowered = text.lower()  # lowercase once instead of once per keyword
    return int(any(keyword in lowered for keyword in spam_keywords))

# NOTE: the original commented-out 'Word_Of_Mouth' column was never added to
# the training features; this predicate is only used for the justification
# text produced by classify_message.
# Defining a function to clean up the text
def Clean(Text):
    """Normalize a message: keep letters only, lowercase, collapse whitespace."""
    letters_only = re.sub('[^a-zA-Z]', ' ', Text)  # non-letters become spaces
    return ' '.join(letters_only.lower().split())
data["Text"] = data["Text"].apply(Clean)
import nltk
nltk.download('punkt_tab')
data["Tokenize_Text"]=data.apply(lambda row: nltk.word_tokenize(row["Text"]), axis=1)
from nltk.corpus import stopwords
nltk.download('stopwords')
# Removing the stopwords function
def remove_stopwords(text):
    """Drop English stopwords from a token list, preserving token order."""
    stop_words = set(stopwords.words("english"))
    return [token for token in text if token not in stop_words]
data["Nostopword_Text"] = data["Tokenize_Text"].apply(remove_stopwords)
import nltk
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
# lemmatize string
def lemmatize_word(text):
    """Lemmatize each token as a verb (pos='v') to collapse inflected forms."""
    return [lemmatizer.lemmatize(token, pos='v') for token in text]
data["Lemmatized_Text"] = data["Nostopword_Text"].apply(lemmatize_word)
#Creating a corpus of text feature to encode further into vectorized form
corpus= []
for i in data["Lemmatized_Text"]:
msg = ' '.join([row for row in i])
corpus.append(msg)
corpus[:5]
# Changing text data in to numbers (TF-IDF vectorization).
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Vectorizing the text messages, keeping the 3000 strongest terms
tfidf = TfidfVectorizer(max_features=3000)
X_tfidf = tfidf.fit_transform(corpus).toarray()

# Append the hand-crafted binary indicator columns to the TF-IDF matrix
X_additional_features = np.column_stack(
    (X_tfidf, data[['Phone', 'URL', 'Email']].values)
)
"""3-Encoding the Target class"""
#Label encode the Target and use it as y
label_encoder = LabelEncoder()
data["Target"] = label_encoder.fit_transform(data["Target"])
# Defining the target
y = data['Target']
# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X_additional_features, y, test_size=0.3, random_state=42)
from imblearn.over_sampling import SMOTE
# Initialize SMOTE
smote = SMOTE(random_state=42)
# Fit and resample the training data
X_train, y_train = smote.fit_resample(X_train, y_train)
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Train the SVC model (the original comment said "Naive Bayes" — it is an SVC).
# probability=True enables predict_proba, used by classify_message below.
svc_model = SVC(random_state=42, probability=True)
svc_model.fit(X_train, y_train)

# Evaluate on the held-out test split
y_pred_svc = svc_model.predict(X_test)
accuracy_svc = accuracy_score(y_test, y_pred_svc)
# Function to classify a message and provide justification
def classify_message(message):
    """Classify a raw message as Spam / Not Spam.

    Runs the same preprocessing pipeline used at training time, predicts
    with the trained SVC, and builds a human-readable justification.

    Parameters
    ----------
    message : str
        The raw, unprocessed message text.

    Returns
    -------
    dict
        Keys "Label", "Justification" and "Spam Probability" (percentage
        string with two decimals).
    """
    # Preprocess exactly as in training: clean, tokenize, de-stopword, lemmatize
    message_cleaned = Clean(message)
    message_tokens = nltk.word_tokenize(message_cleaned)
    message_no_stopwords = remove_stopwords(message_tokens)
    message_lemmatized = lemmatize_word(message_no_stopwords)
    message_corpus = ' '.join(message_lemmatized)

    # TF-IDF vector plus the same hand-crafted binary features as training.
    # (spam_word_feature feeds only the justification text, not the model —
    # matching the original training matrix of Phone/URL/Email.)
    message_tfidf = tfidf.transform([message_corpus]).toarray()
    phone_feature = contains_phone_number(message)
    url_feature = contains_url(message)
    email_feature = contains_email(message)
    spam_word_feature = contains_spam_words(message)
    message_features = np.column_stack(
        (message_tfidf, [[phone_feature, url_feature, email_feature]])
    )

    # Predict using the trained model
    prediction = svc_model.predict(message_features)
    probability = svc_model.predict_proba(message_features)[0][1]  # P(spam)
    is_spam = prediction[0] == 1

    # Collect the indicators that support a spam verdict
    justifications = []
    if is_spam:
        if phone_feature:
            justifications.append("a phone number, which is often used in spam messages")
        if url_feature:
            justifications.append("a link, a common element in spam content")
        if email_feature:
            justifications.append("an email address, which may indicate promotional or smishing intent")
        if spam_word_feature:
            justifications.append("language commonly found in spam messages")

    label = "Spam" if is_spam else "Not Spam"
    if justifications:
        justification = (
            "The reason for this classification is that the message includes "
            + ", and ".join(justifications)
        )
    else:
        # The original glued this sentence onto "...the message includes ",
        # yielding ungrammatical output; state it as a full sentence instead.
        justification = (
            "The reason for this classification is that "
            "no clear signs of spam were found in the message"
        )
    return {"Label": label, "Justification": justification, "Spam Probability": f"{probability * 100:.2f}%"}
import gradio as gr

# Gradio interface function
def gradio_interface(message):
    """Format classify_message's result dict as Markdown for the Gradio UI."""
    result = classify_message(message)
    lines = [
        f"**Label:** {result['Label']}",
        f"**Justification:** {result['Justification']}",
        f"**Spam Probability:** {result['Spam Probability']}",
    ]
    return "\n".join(lines)
# Example inputs shown under the input box in the UI
example_messages = [
    ["Congratulations! You've won a free iPhone. Click here to claim now!"],
    ["Meeting scheduled at 3 PM. Let me know if you can make it."],
    ["Get a low-interest mortgage today! No risk, apply now."],
    ["Reminder: Your appointment is tomorrow at 10:00 AM."],
]

# Gradio app
interface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Textbox(
        lines=4,
        placeholder="Enter a message to classify...",
        label="Input Message",
    ),
    outputs=gr.Markdown(),
    title="MIRSAD",
    description=(
        "Mirsad classifies a given message as spam or not spam. "
        "It provides a justification for the classification and indicates the likelihood of the message being spam."
    ),
    examples=example_messages,
    theme="default",
)

# Launch the app
interface.launch()