# Spaces: Sleeping
# (Hugging Face Space status banner captured alongside the source file; not part of the program.)
from huggingface_hub import InferenceClient  # kept from the Space template; not used below

# -*- coding: utf-8 -*-
"""Mirsad-model-only.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/12QnA8fnwQNDyKtRg0CjLXX84umecSsvE
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from matplotlib.colors import ListedColormap
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, classification_report, accuracy_score, f1_score
from sklearn.metrics import ConfusionMatrixDisplay  # ConfusionMatrixDisplay instead of the deprecated plot helper

"""### Load data"""

# Load the CSV file into a DataFrame.
# latin-1 handles the non-UTF-8 bytes present in this dataset.
file_path = 'spam.csv'
data = pd.read_csv(file_path, encoding='latin-1')

# Display the first few rows of the dataset.
print(data.head())

"""Dropping columns and renaming."""

# Drop the redundant-looking columns (for this project).
# BUG FIX: the original called data.drop(data[to_drop], axis=1), passing a
# DataFrame where drop() expects column labels; pass the label list directly.
to_drop = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"]
data = data.drop(to_drop, axis=1)

# Rename the columns to meaningful names.
data.rename(columns={"v1": "Target", "v2": "Text"}, inplace=True)
| """# Feature Engineering: Adding New Columns""" | |
| import re | |
| # Function to detect phone numbers (e.g., formats like (123) 456-7890 or +1 123 456 7890) | |
| def contains_phone_number(text): | |
| phone_pattern = re.compile(r'\b(\+?\d{1,2}[-.\s]?)?(\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}\b') | |
| return 1 if phone_pattern.search(text) else 0 | |
# Add the binary 'Phone' feature: 1 when the raw message contains a phone-like number.
data['Phone'] = data['Text'].map(contains_phone_number)
# Detect URLs (http(s)://... or www....).
def contains_url(text):
    """Return 1 if *text* contains a URL, else 0."""
    match = re.search(r'(https?://\S+|www\.\S+)', text)
    return 1 if match else 0
# Add the binary 'URL' feature: 1 when the raw message contains a link.
data['URL'] = data['Text'].map(contains_url)
# Detect e-mail addresses.
def contains_email(text):
    """Return 1 if *text* contains an e-mail address, else 0."""
    pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
    return 0 if re.search(pattern, text) is None else 1
# Add the binary 'Email' feature: 1 when the raw message contains an address.
data['Email'] = data['Text'].map(contains_email)
# Words/phrases that frequently indicate spam. They are matched as substrings
# of the lower-cased message, so multi-word phrases like "act now" work too.
# FIX: "win" appeared twice in the original list; the duplicate is removed
# (membership semantics are unchanged).
spam_keywords = ['warning', 'urgent', 'prize', 'win', 'free', 'claim', 'congratulations', 'offer',
                 "guarantee", "low rates", "credit", "investment", "mortgage", "cash", "save big",
                 "act now", "limited time", "hurry", "final notice", "immediate", "bonus",
                 "exclusive deal", "special promotion", "offer ends soon", "click here", "claim now",
                 "sign up", "subscribe", "apply now", "order now", "no risk", "100% free",
                 "no strings attached", "instant results", "guaranteed", "winner", "luxury",
                 "cheap", "discount", "bargain", "unlimited access"]


def contains_spam_words(text):
    """Return 1 if any spam keyword occurs in the lower-cased *text*, else 0.

    The message is lower-cased once (the original lower-cased it for every
    keyword in the loop).
    """
    lowered = text.lower()
    return 1 if any(word in lowered for word in spam_keywords) else 0


# Adding the column 'Word_Of_Mouth' (left disabled, as in the original notebook):
# data['Word_Of_Mouth'] = data['Text'].apply(contains_spam_words)
def Clean(Text):
    """Normalize a message: strip non-letters, lower-case, collapse whitespace.

    Every character outside a-z/A-Z is replaced with a space, the result is
    lower-cased, and runs of whitespace are collapsed to single spaces.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', Text)
    words = letters_only.lower().split()
    return ' '.join(words)
| data["Text"] = data["Text"].apply(Clean) | |
| import nltk | |
| nltk.download('punkt_tab') | |
| data["Tokenize_Text"]=data.apply(lambda row: nltk.word_tokenize(row["Text"]), axis=1) | |
from nltk.corpus import stopwords

nltk.download('stopwords')

# PERF FIX: build the stop-word set once at module level. The original
# recomputed set(stopwords.words("english")) — which re-reads the NLTK word
# list — on every call, and remove_stopwords runs once per message.
_STOP_WORDS = set(stopwords.words("english"))


def remove_stopwords(text):
    """Return the tokens of *text* (a list of words) that are not English stop words."""
    return [word for word in text if word not in _STOP_WORDS]


data["Nostopword_Text"] = data["Tokenize_Text"].apply(remove_stopwords)
import nltk

nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()


def lemmatize_word(text):
    """Lemmatize each token in *text* (a list of words), treating tokens as verbs.

    pos='v' gives the lemmatizer part-of-speech context so inflected verb
    forms normalize better (e.g. "winning" -> "win").
    """
    return [lemmatizer.lemmatize(token, pos='v') for token in text]


data["Lemmatized_Text"] = data["Nostopword_Text"].apply(lemmatize_word)
# Build the corpus: join each lemmatized token list back into one string per
# message, ready for TF-IDF vectorization.
corpus = [' '.join(tokens) for tokens in data["Lemmatized_Text"]]

# Peek at the first few entries (notebook-style inspection; no effect in a script).
corpus[:5]
# Changing text data into numbers.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Vectorize the messages, keeping the 3000 highest-scoring terms.
tfidf = TfidfVectorizer(max_features=3000)
X_tfidf = tfidf.fit_transform(corpus).toarray()

# Append the hand-crafted binary features to the TF-IDF matrix, one row per message.
X_additional_features = np.hstack((X_tfidf, data[['Phone', 'URL', 'Email']].to_numpy()))

# A quick look at the feature dtype (notebook-style inspection).
X_tfidf.dtype
| """3-Encoding the Target class""" | |
| #Label encode the Target and use it as y | |
| label_encoder = LabelEncoder() | |
| data["Target"] = label_encoder.fit_transform(data["Target"]) | |
| # Defining the target | |
| y = data['Target'] | |
| # Splitting the dataset | |
| X_train, X_test, y_train, y_test = train_test_split(X_additional_features, y, test_size=0.3, random_state=42) | |
from imblearn.over_sampling import SMOTE

# Oversample the minority class — on the TRAINING split only, so the test
# set stays untouched.
oversampler = SMOTE(random_state=42)
X_train, y_train = oversampler.fit_resample(X_train, y_train)
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Train the SVC model. probability=True enables predict_proba, which the
# classifier function uses later to report a spam probability.
# (The original comment said "Naive Bayes", but the model here is an SVC.)
svc_model = SVC(random_state=42, probability=True)
svc_model.fit(X_train, y_train)

# Evaluate on the held-out test set.
y_pred_svc = svc_model.predict(X_test)
accuracy_svc = accuracy_score(y_test, y_pred_svc)
# Classify a message and provide a human-readable justification.
def classify_message(message):
    """Classify *message* as spam / not spam and explain why.

    Runs the raw message through the same preprocessing used at training time
    (clean -> tokenize -> stopword removal -> lemmatize -> TF-IDF), appends
    the hand-crafted phone/URL/email features, and asks the trained SVC for a
    label and a spam probability.

    Returns a dict with keys "Label", "Justification" and "Spam Probability".
    """
    import numpy as np

    # Preprocess exactly as the training corpus was preprocessed.
    message_cleaned = Clean(message)
    message_tokens = nltk.word_tokenize(message_cleaned)
    message_no_stopwords = remove_stopwords(message_tokens)
    message_lemmatized = lemmatize_word(message_no_stopwords)
    message_corpus = ' '.join(message_lemmatized)

    # TF-IDF vector for this single message.
    message_tfidf = tfidf.transform([message_corpus]).toarray()

    # Hand-crafted features come from the RAW message (before cleaning),
    # since cleaning strips the digits/punctuation they rely on.
    phone_feature = contains_phone_number(message)
    url_feature = contains_url(message)
    email_feature = contains_email(message)
    spam_word_feature = contains_spam_words(message)

    # Same feature layout as training: TF-IDF columns + [Phone, URL, Email].
    # (spam_word_feature feeds the explanation only, not the model.)
    message_features = np.column_stack((message_tfidf, [[phone_feature, url_feature, email_feature]]))

    # Predict label and spam probability (probability of class 1).
    prediction = svc_model.predict(message_features)
    probability = svc_model.predict_proba(message_features)[0][1]

    # Build human-readable reasons — only for messages classified as spam.
    justifications = []
    if prediction[0] == 1:
        if phone_feature:
            justifications.append("a phone number, which is often used in spam messages")
        if url_feature:
            justifications.append("a link, a common element in spam content")
        if email_feature:
            justifications.append("an email address, which may indicate promotional or smishing intent")
        if spam_word_feature:
            justifications.append("language commonly found in spam messages")

    label = "Spam" if prediction[0] == 1 else "Not Spam"
    if justifications:
        justification = ("The reason for this classification is that the message includes "
                         + ", and ".join(justifications))
    else:
        # BUG FIX: the original appended this fallback after "the message
        # includes ...", producing an ungrammatical sentence; emit it standalone.
        justification = "No clear signs of spam were found in the message."

    return {"Label": label, "Justification": justification, "Spam Probability": f"{probability * 100:.2f}%"}
import gradio as gr


# Format the classifier's result dict as Markdown for the Gradio UI.
def gradio_interface(message):
    result = classify_message(message)
    lines = [
        f"**Label:** {result['Label']}",
        f"**Justification:** {result['Justification']}",
        f"**Spam Probability:** {result['Spam Probability']}",
    ]
    return "\n".join(lines)
# Gradio app definition.
app_description = (
    "Mirsad classifies a given message as spam or not spam. "
    "It provides a justification for the classification and indicates the likelihood of the message being spam."
)

example_messages = [
    ["Congratulations! You've won a free iPhone. Click here to claim now!"],
    ["Meeting scheduled at 3 PM. Let me know if you can make it."],
    ["Get a low-interest mortgage today! No risk, apply now."],
    ["Reminder: Your appointment is tomorrow at 10:00 AM."],
]

interface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Textbox(lines=4, placeholder="Enter a message to classify...", label="Input Message"),
    outputs=gr.Markdown(),
    title="MIRSAD",
    description=app_description,
    examples=example_messages,
    theme="default",
)

# Launch the app.
interface.launch()