# Spaces: Sleeping
# (Hugging Face Space status banner captured alongside the source file; not part of the program.)
from huggingface_hub import InferenceClient  # kept from the Space template; not used below

# -*- coding: utf-8 -*-
"""Mirsad-model-only.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/12QnA8fnwQNDyKtRg0CjLXX84umecSsvE
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from matplotlib.colors import ListedColormap
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, classification_report, accuracy_score, f1_score
from sklearn.metrics import ConfusionMatrixDisplay  # ConfusionMatrixDisplay instead of the deprecated plot helper

"""### Load data"""

# Load the CSV file into a DataFrame.
# latin-1 handles the non-UTF-8 bytes present in this dataset.
file_path = 'spam.csv'
data = pd.read_csv(file_path, encoding='latin-1')

# Display the first few rows of the dataset.
print(data.head())

"""Dropping columns and renaming."""

# Drop the redundant-looking columns (for this project).
# BUG FIX: the original called data.drop(data[to_drop], axis=1), passing a
# DataFrame where drop() expects column labels; pass the label list directly.
to_drop = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"]
data = data.drop(to_drop, axis=1)

# Rename the columns to meaningful names.
data.rename(columns={"v1": "Target", "v2": "Text"}, inplace=True)
| """# Feature Engineering: Adding New Columns""" | |
| import re | |
| # Function to detect phone numbers (e.g., formats like (123) 456-7890 or +1 123 456 7890) | |
| def contains_phone_number(text): | |
| phone_pattern = re.compile(r'\b(\+?\d{1,2}[-.\s]?)?(\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}\b') | |
| return 1 if phone_pattern.search(text) else 0 | |
# Add the binary 'Phone' feature: 1 when the raw message contains a phone-like number.
data['Phone'] = data['Text'].map(contains_phone_number)
# Detect URLs (http(s)://... or www....).
def contains_url(text):
    """Return 1 if *text* contains a URL, else 0."""
    match = re.search(r'(https?://\S+|www\.\S+)', text)
    return 1 if match else 0
# Add the binary 'URL' feature: 1 when the raw message contains a link.
data['URL'] = data['Text'].map(contains_url)
# Detect e-mail addresses.
def contains_email(text):
    """Return 1 if *text* contains an e-mail address, else 0."""
    pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
    return 0 if re.search(pattern, text) is None else 1
# Add the binary 'Email' feature: 1 when the raw message contains an address.
data['Email'] = data['Text'].map(contains_email)
# Words/phrases that frequently indicate spam. They are matched as substrings
# of the lower-cased message, so multi-word phrases like "act now" work too.
# FIX: "win" appeared twice in the original list; the duplicate is removed
# (membership semantics are unchanged).
spam_keywords = ['warning', 'urgent', 'prize', 'win', 'free', 'claim', 'congratulations', 'offer',
                 "guarantee", "low rates", "credit", "investment", "mortgage", "cash", "save big",
                 "act now", "limited time", "hurry", "final notice", "immediate", "bonus",
                 "exclusive deal", "special promotion", "offer ends soon", "click here", "claim now",
                 "sign up", "subscribe", "apply now", "order now", "no risk", "100% free",
                 "no strings attached", "instant results", "guaranteed", "winner", "luxury",
                 "cheap", "discount", "bargain", "unlimited access"]


def contains_spam_words(text):
    """Return 1 if any spam keyword occurs in the lower-cased *text*, else 0.

    The message is lower-cased once (the original lower-cased it for every
    keyword in the loop).
    """
    lowered = text.lower()
    return 1 if any(word in lowered for word in spam_keywords) else 0


# Adding the column 'Word_Of_Mouth' (left disabled, as in the original notebook):
# data['Word_Of_Mouth'] = data['Text'].apply(contains_spam_words)
def Clean(Text):
    """Normalize a message: strip non-letters, lower-case, collapse whitespace.

    Every character outside a-z/A-Z is replaced with a space, the result is
    lower-cased, and runs of whitespace are collapsed to single spaces.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', Text)
    words = letters_only.lower().split()
    return ' '.join(words)
| data["Text"] = data["Text"].apply(Clean) | |
| import nltk | |
| nltk.download('punkt_tab') | |
| data["Tokenize_Text"]=data.apply(lambda row: nltk.word_tokenize(row["Text"]), axis=1) | |
from nltk.corpus import stopwords

nltk.download('stopwords')

# PERF FIX: build the stop-word set once at module level. The original
# recomputed set(stopwords.words("english")) — which re-reads the NLTK word
# list — on every call, and remove_stopwords runs once per message.
_STOP_WORDS = set(stopwords.words("english"))


def remove_stopwords(text):
    """Return the tokens of *text* (a list of words) that are not English stop words."""
    return [word for word in text if word not in _STOP_WORDS]


data["Nostopword_Text"] = data["Tokenize_Text"].apply(remove_stopwords)
import nltk

nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()


def lemmatize_word(text):
    """Lemmatize each token in *text* (a list of words), treating tokens as verbs.

    pos='v' gives the lemmatizer part-of-speech context so inflected verb
    forms normalize better (e.g. "winning" -> "win").
    """
    return [lemmatizer.lemmatize(token, pos='v') for token in text]


data["Lemmatized_Text"] = data["Nostopword_Text"].apply(lemmatize_word)
# Build the corpus: join each lemmatized token list back into one string per
# message, ready for TF-IDF vectorization.
corpus = [' '.join(tokens) for tokens in data["Lemmatized_Text"]]

# Peek at the first few entries (notebook-style inspection; no effect in a script).
corpus[:5]
# Changing text data into numbers.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Vectorize the messages, keeping the 3000 highest-scoring terms.
tfidf = TfidfVectorizer(max_features=3000)
X_tfidf = tfidf.fit_transform(corpus).toarray()

# Append the hand-crafted binary features to the TF-IDF matrix, one row per message.
X_additional_features = np.hstack((X_tfidf, data[['Phone', 'URL', 'Email']].to_numpy()))

# A quick look at the feature dtype (notebook-style inspection).
X_tfidf.dtype
| """3-Encoding the Target class""" | |
| #Label encode the Target and use it as y | |
| label_encoder = LabelEncoder() | |
| data["Target"] = label_encoder.fit_transform(data["Target"]) | |
| # Defining the target | |
| y = data['Target'] | |
| # Splitting the dataset | |
| X_train, X_test, y_train, y_test = train_test_split(X_additional_features, y, test_size=0.3, random_state=42) | |
from imblearn.over_sampling import SMOTE

# Oversample the minority class — on the TRAINING split only, so the test
# set stays untouched.
oversampler = SMOTE(random_state=42)
X_train, y_train = oversampler.fit_resample(X_train, y_train)
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Train the SVC model. probability=True enables predict_proba, which the
# classifier function uses later to report a spam probability.
# (The original comment said "Naive Bayes", but the model here is an SVC.)
svc_model = SVC(random_state=42, probability=True)
svc_model.fit(X_train, y_train)

# Evaluate on the held-out test set.
y_pred_svc = svc_model.predict(X_test)
accuracy_svc = accuracy_score(y_test, y_pred_svc)
# Classify a message and provide a human-readable justification.
def classify_message(message):
    """Classify *message* as spam / not spam and explain why.

    Runs the raw message through the same preprocessing used at training time
    (clean -> tokenize -> stopword removal -> lemmatize -> TF-IDF), appends
    the hand-crafted phone/URL/email features, and asks the trained SVC for a
    label and a spam probability.

    Returns a dict with keys "Label", "Justification" and "Spam Probability".
    """
    import numpy as np

    # Preprocess exactly as the training corpus was preprocessed.
    message_cleaned = Clean(message)
    message_tokens = nltk.word_tokenize(message_cleaned)
    message_no_stopwords = remove_stopwords(message_tokens)
    message_lemmatized = lemmatize_word(message_no_stopwords)
    message_corpus = ' '.join(message_lemmatized)

    # TF-IDF vector for this single message.
    message_tfidf = tfidf.transform([message_corpus]).toarray()

    # Hand-crafted features come from the RAW message (before cleaning),
    # since cleaning strips the digits/punctuation they rely on.
    phone_feature = contains_phone_number(message)
    url_feature = contains_url(message)
    email_feature = contains_email(message)
    spam_word_feature = contains_spam_words(message)

    # Same feature layout as training: TF-IDF columns + [Phone, URL, Email].
    # (spam_word_feature feeds the explanation only, not the model.)
    message_features = np.column_stack((message_tfidf, [[phone_feature, url_feature, email_feature]]))

    # Predict label and spam probability (probability of class 1).
    prediction = svc_model.predict(message_features)
    probability = svc_model.predict_proba(message_features)[0][1]

    # Build human-readable reasons — only for messages classified as spam.
    justifications = []
    if prediction[0] == 1:
        if phone_feature:
            justifications.append("a phone number, which is often used in spam messages")
        if url_feature:
            justifications.append("a link, a common element in spam content")
        if email_feature:
            justifications.append("an email address, which may indicate promotional or smishing intent")
        if spam_word_feature:
            justifications.append("language commonly found in spam messages")

    label = "Spam" if prediction[0] == 1 else "Not Spam"
    if justifications:
        justification = ("The reason for this classification is that the message includes "
                         + ", and ".join(justifications))
    else:
        # BUG FIX: the original appended this fallback after "the message
        # includes ...", producing an ungrammatical sentence; emit it standalone.
        justification = "No clear signs of spam were found in the message."

    return {"Label": label, "Justification": justification, "Spam Probability": f"{probability * 100:.2f}%"}
import gradio as gr


# Format the classifier's result dict as Markdown for the Gradio UI.
def gradio_interface(message):
    result = classify_message(message)
    lines = [
        f"**Label:** {result['Label']}",
        f"**Justification:** {result['Justification']}",
        f"**Spam Probability:** {result['Spam Probability']}",
    ]
    return "\n".join(lines)
# Gradio app definition.
app_description = (
    "Mirsad classifies a given message as spam or not spam. "
    "It provides a justification for the classification and indicates the likelihood of the message being spam."
)

example_messages = [
    ["Congratulations! You've won a free iPhone. Click here to claim now!"],
    ["Meeting scheduled at 3 PM. Let me know if you can make it."],
    ["Get a low-interest mortgage today! No risk, apply now."],
    ["Reminder: Your appointment is tomorrow at 10:00 AM."],
]

interface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Textbox(lines=4, placeholder="Enter a message to classify...", label="Input Message"),
    outputs=gr.Markdown(),
    title="MIRSAD",
    description=app_description,
    examples=example_messages,
    theme="default",
)

# Launch the app.
interface.launch()