Email_Spam_detection_using_machine_learning

Runtime error

App Files Files Community

ABHI010 committed on Dec 6, 2024

Commit

03df076

verified ·

1 Parent(s): 53ceef1

Update app.py

Browse files

Files changed (1) hide show

app.py +54 -19

app.py CHANGED Viewed

@@ -1,4 +1,3 @@
 import pandas as pd
 from sklearn.model_selection import train_test_split
 from sklearn.feature_extraction.text import CountVectorizer
@@ -52,7 +51,7 @@ spam_keywords = [
 def highlight_keywords(text):
     highlighted = text
     for keyword in spam_keywords:
-        pattern = re.compile(rf"{keyword}", re.IGNORECASE)
         highlighted = pattern.sub(f"<span class='highlight'>{keyword}</span>", highlighted)
     return highlighted
@@ -105,6 +104,32 @@ def generate_performance_metrics():
         "confusion_matrix_plot": img_base64,
     }
 # Updated CSS
 custom_css = """
 body {
@@ -115,20 +140,17 @@ body {
     background-attachment: fixed;
     color: #333;
 }
 h1, h2, h3 {
     text-align: center;
     color: #ffffff;
     text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.7);
 }
 .gradio-container {
     background-color: rgba(255, 255, 255, 0.8);
     border-radius: 10px;
     padding: 20px;
     box-shadow: 0px 4px 10px rgba(0, 0, 0, 0.3);
 }
 button {
     background-color: #1e90ff;
     color: white;
@@ -139,19 +161,16 @@ button {
     font-size: 1.2em;
     transition: transform 0.2s, background-color 0.3s;
 }
 button:hover {
     background-color: #1c86ee;
     transform: scale(1.05);
 }
 .highlight {
     background-color: #ffeb3b;
     font-weight: bold;
     padding: 0 3px;
     border-radius: 3px;
 }
 .metric {
     font-size: 1.2em;
     text-align: center;
@@ -204,13 +223,7 @@ def create_interface():
         analyze_button.click(
             fn=email_analysis_pipeline,
             inputs=email_input,
-            outputs=[
-                result_output,
-                confidence_output,
-                highlighted_output,
-                keywords_output,
-                advice_output
-            ]
         )
         gr.Markdown("## 📊 Model Performance Analytics")
@@ -224,6 +237,22 @@ def create_interface():
                 gr.Markdown("### Confusion Matrix")
                 gr.HTML(f"<img src='data:image/png;base64,{performance_metrics['confusion_matrix_plot']}' style='max-width: 100%; height: auto;' />")
         gr.Markdown("## 📘 Glossary and Explanation of Labels")
         gr.Markdown(
             """
@@ -231,14 +260,19 @@ def create_interface():
             - **Spam:** Unwanted or harmful emails flagged by the system.
             - **Ham:** Legitimate, safe emails.
             ### Metrics:
-            - **Accuracy:** Percentage of correct classifications.
             - **Precision:** Out of predicted Spam, how many are actually Spam.
             - **Recall:** Out of all actual Spam emails, how many are predicted as Spam.
             - **F1 Score:** Harmonic mean of Precision and Recall.
-            ### Confusion Matrix:
-            Shows the distribution of true vs predicted labels.
             """
         )
@@ -247,3 +281,4 @@ def create_interface():
 # Launch the interface
 interface = create_interface()
 interface.launch(share=True)

 import pandas as pd
 from sklearn.model_selection import train_test_split
 from sklearn.feature_extraction.text import CountVectorizer
 def highlight_keywords(text, keywords=None):
     """Wrap every spam keyword found in *text* in a highlight ``<span>``.

     Matching is case-insensitive and limited to whole terms: a keyword is
     not matched when it is directly preceded or followed by a word
     character. The matched text is kept exactly as it appears in *text*.

     Args:
         text: The email text to annotate.
         keywords: Optional iterable of keyword strings. When omitted, the
             module-level ``spam_keywords`` list is used.

     Returns:
         *text* with each match wrapped in
         ``<span class='highlight'>...</span>``. Empty/``None`` text, or an
         empty keyword list, returns *text* unchanged.
     """
     terms = spam_keywords if keywords is None else keywords
     # Longest keywords first so a multi-word keyword wins over a shorter
     # keyword it contains (regex alternation takes the first alternative).
     ordered = sorted({str(t) for t in terms if t}, key=lambda t: (-len(t), t))
     if not text or not ordered:
         return text

     # Keywords are literal text, not regex fragments, so escape them.
     alternation = "|".join(re.escape(t) for t in ordered)
     # Lookarounds instead of \b so keywords that start or end with a
     # non-word character (e.g. "$$$", "100%") can still match.
     pattern = re.compile(rf"(?<!\w)(?:{alternation})(?!\w)", re.IGNORECASE)

     # A single pass over the original text means markup inserted for one
     # keyword is never re-scanned for another. The callable replacement keeps
     # the matched casing and avoids backslash-template interpretation.
     # NOTE(review): *text* itself is not HTML-escaped here; if the result is
     # rendered as HTML, confirm that escaping happens elsewhere.
     return pattern.sub(
         lambda match: f"<span class='highlight'>{match.group(0)}</span>",
         text,
     )
         "confusion_matrix_plot": img_base64,
     }
+# Function to add new email data and retrain the model
def save_and_retrain(email_text, label):
    """Add one labelled email to the dataset, refit, and save the model.

    The new row is appended to the module-level ``dataset``, the module-level
    ``vectorizer`` and ``model`` are refit on the full data, and both are
    written to ``spam_model.pkl`` / ``spam_vectorizer.pkl`` with joblib.

    Args:
        email_text: Email content to learn from; must be a non-blank string.
        label: Either ``"Spam"`` or ``"Ham"``.

    Returns:
        A status message string. Problems are reported in the returned text
        rather than raised.
    """
    global dataset, X, y, model, vectorizer

    # Reject bad input up front: previously a missing label (None) was
    # silently stored as Ham and blank text was added to the training data.
    if not isinstance(email_text, str) or not email_text.strip():
        return "Error while retraining: email content is empty."
    if label not in ("Spam", "Ham"):
        return "Error while retraining: label must be 'Spam' or 'Ham'."

    try:
        # Convert label to numeric value (0 for Ham, 1 for Spam)
        label_numeric = 1 if label == "Spam" else 0
        new_data = pd.DataFrame({"text": [email_text], "spam": [label_numeric]})

        # Work on local copies first so the shared dataset is only replaced
        # once the refit has gone through.
        updated = pd.concat([dataset, new_data], ignore_index=True)
        features = vectorizer.fit_transform(updated['text'])
        targets = updated['spam']
        model.fit(features, targets)
        dataset, X, y = updated, features, targets

        # Save the updated model and vectorizer
        joblib.dump(model, 'spam_model.pkl')
        joblib.dump(vectorizer, 'spam_vectorizer.pkl')
    except Exception as e:
        # Broad catch is deliberate: any failure is turned into a status
        # message for the caller instead of propagating.
        return f"Error while retraining: {str(e)}"
    return "Model retrained successfully with new data!"
 # Updated CSS
 custom_css = """
 body {
     background-attachment: fixed;
     color: #333;
 }
 h1, h2, h3 {
     text-align: center;
     color: #ffffff;
     text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.7);
 }
 .gradio-container {
     background-color: rgba(255, 255, 255, 0.8);
     border-radius: 10px;
     padding: 20px;
     box-shadow: 0px 4px 10px rgba(0, 0, 0, 0.3);
 }
 button {
     background-color: #1e90ff;
     color: white;
     font-size: 1.2em;
     transition: transform 0.2s, background-color 0.3s;
 }
 button:hover {
     background-color: #1c86ee;
     transform: scale(1.05);
 }
 .highlight {
     background-color: #ffeb3b;
     font-weight: bold;
     padding: 0 3px;
     border-radius: 3px;
 }
 .metric {
     font-size: 1.2em;
     text-align: center;
         analyze_button.click(
             fn=email_analysis_pipeline,
             inputs=email_input,
+            outputs=[result_output, confidence_output, highlighted_output, keywords_output, advice_output]
         )
         gr.Markdown("## 📊 Model Performance Analytics")
                 gr.Markdown("### Confusion Matrix")
                 gr.HTML(f"<img src='data:image/png;base64,{performance_metrics['confusion_matrix_plot']}' style='max-width: 100%; height: auto;' />")
+        gr.Markdown("## 🛠️ Save and Retrain the Model")
+        with gr.Row():
+            email_for_retraining = gr.Textbox(
+                lines=8, placeholder="Enter the email content to label as Spam or Ham and retrain", label="Email Content"
+            )
+            label_input = gr.Radio(["Spam", "Ham"], label="Label", type="value")
+        retrain_button = gr.Button("Save & Retrain Model")
+        retrain_result = gr.Textbox(label="Retrain Result", interactive=False)
+        retrain_button.click(
+            fn=save_and_retrain,
+            inputs=[email_for_retraining, label_input],
+            outputs=retrain_result
+        )
         gr.Markdown("## 📘 Glossary and Explanation of Labels")
         gr.Markdown(
             """
             - **Spam:** Unwanted or harmful emails flagged by the system.
             - **Ham:** Legitimate, safe emails.
+            ### Confusion Matrix:
+            The confusion matrix shows the performance of the model by comparing the true labels with the predicted ones.
+            It consists of:
+            - **True Positives (TP):** Correctly predicted spam emails.
+            - **True Negatives (TN):** Correctly predicted ham emails.
+            - **False Positives (FP):** Ham emails incorrectly predicted as spam.
+            - **False Negatives (FN):** Spam emails incorrectly predicted as ham.
             ### Metrics:
+            - **Accuracy:** The percentage of correct classifications.
             - **Precision:** Out of predicted Spam, how many are actually Spam.
             - **Recall:** Out of all actual Spam emails, how many are predicted as Spam.
             - **F1 Score:** Harmonic mean of Precision and Recall.
             """
         )
 # Build the UI via create_interface() and start serving it. Per Gradio's
 # launch() documentation, share=True additionally requests a public share link.
 interface = create_interface()
 interface.launch(share=True)