Spaces:

leygit
/

ITI110_Spam_Classification_Project

Sleeping

App Files Community

leygit committed on Feb 26, 2025

Commit

fd64ed7

verified ·

1 Parent(s): 6760c84

Rename app2test.py to app.py

Browse files

Files changed (1) hide show

app2test.py → app.py +32 -111

app2test.py → app.py RENAMED Viewed

@@ -1,4 +1,4 @@
-#DISTILLBERT RUN 3 , added weight_decay=0.01
 import pandas as pd
 import torch
 import torch.nn as nn
@@ -8,77 +8,24 @@ from torch.utils.data import Dataset, DataLoader
 from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import classification_report
-from transformers import BertTokenizer
-# Load dataset
-file_path = 'spam_ham_dataset.csv'
-df = pd.read_csv(file_path)
-# Convert labels to numeric
-df['label_num'] = df['label'].map({'ham': 0, 'spam': 1})
 # Load tokenizer
-tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
-# Tokenize dataset
-encodings = tokenizer(df['text'].tolist(), padding=True, truncation=True, max_length=128, return_tensors="pt")
-labels = torch.tensor(df['label_num'].values)
-# Custom Dataset
-class SpamDataset(Dataset):
-    def __init__(self, encodings, labels):
-        self.encodings = encodings
-        self.labels = labels
-    def __len__(self):
-        return len(self.labels)
-    def __getitem__(self, idx):
-        item = {key: val[idx] for key, val in self.encodings.items()}
-        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
-        return item
-# Create dataset
-dataset = SpamDataset(encodings, labels)
-# Split dataset (80% train, 20% validation)
-train_size = int(0.8 * len(dataset))
-val_size = len(dataset) - train_size
-train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])
-# DataLoader with batch size
-def collate_fn(batch):
-    keys = batch[0].keys()
-    return {key: torch.stack([b[key] for b in batch]) for key in keys}
-train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
-val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)
 # Load the trained model
 def load_model(model_path="distilbert_spam_model.pt"):
     model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
-    model.load_state_dict(torch.load(model_path, map_location=torch.device("cpu")))  # Load model weights
     model.eval()  # Set model to evaluation mode
     return model
-# Evaluation
-model.eval()
-correct = 0
-total = 0
-with torch.no_grad():
-    for batch in val_loader:
-        inputs = {key: val.to(device) for key, val in batch.items()}
-        labels = inputs.pop("labels").to(device)
-        outputs = model(**inputs)
-        predictions = torch.argmax(outputs.logits, dim=1)
-        correct += (predictions == labels).sum().item()
-        total += labels.size(0)
-accuracy = correct / total
-print(f"Validation Accuracy: {accuracy:.4f}")
 # Classification function
 def classify_email(email_text):
@@ -144,73 +91,47 @@ def evaluate_model_with_report(val_loader):
     return accuracy
-# Run evaluation with classification report
-accuracy = evaluate_model_with_report(val_loader)
-print(f"Model Validation Accuracy: {accuracy:.4f}")
-## Gradio Interface
-import gradio as gr
-# Create Gradio Interface
 def create_interface():
-    performance_metrics = generate_performance_metrics()
-    # Introduction - Title + Brief Description
-    with gr.Blocks(css=custom_css) as interface:
         gr.Markdown("Spam Email Classification")
-        gr.Markdown(
-            """
-            Brief description of the project here
-            """
-        )
         # Email Text Input
-        with gr.Row():
-            email_input = gr.Textbox(
-                lines=8, placeholder="Type or paste your email content here...", label="Email Content"
-            )
         # Email Text Results and Analysis
-        with gr.Row():
-            result_output = gr.HTML(label="Classification Result") # label = [function that prints classification result]
-            confidence_output = gr.Textbox(label="Confidence Score", interactive=False)
-            accuracy_output = gr.Textbox(label="Accuracy", interactive=False)
         analyze_button = gr.Button("Analyze Email 🕵️‍♂️")
         analyze_button.click(
-            fn=email_analysis_pipeline,
             inputs=email_input,
             outputs=[result_output, confidence_output, accuracy_output]
         )
-        # Analysis
         gr.Markdown("## 📊 Model Performance Analytics")
         with gr.Row():
-            with gr.Column():
-                gr.Textbox(value=performance_metrics["accuracy"], label="Accuracy", interactive=False, elem_classes=["metric"])
-                gr.Textbox(value=performance_metrics["precision"], label="Precision", interactive=False, elem_classes=["metric"])
-                gr.Textbox(value=performance_metrics["recall"], label="Recall", interactive=False, elem_classes=["metric"])
-                gr.Textbox(value=performance_metrics["f1_score"], label="F1 Score", interactive=False, elem_classes=["metric"])
-            with gr.Column():
-                gr.Markdown("### Confusion Matrix")
-                gr.HTML(f"<img src='data:image/png;base64,{performance_metrics['confusion_matrix_plot']}' style='max-width: 100%; height: auto;' />")
-        gr.Markdown("## 📘 Glossary and Explanation of Labels")
-        gr.Markdown(
-            """
-            ### Labels:
-            - **Spam:** Unwanted or harmful emails flagged by the system.
-            - **Ham:** Legitimate, safe emails.
-            ### Metrics:
-            - **Accuracy:** The percentage of correct classifications.
-            - **Precision:** Out of predicted Spam, how many are actually Spam.
-            - **Recall:** Out of all actual Spam emails, how many are predicted as Spam.
-            - **F1 Score:** Harmonic mean of Precision and Recall.
-            """
-        )
     return interface

+# DISTILLBERT RUN 3 , added weight_decay=0.01
 import pandas as pd
 import torch
 import torch.nn as nn
 from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import classification_report
+import gradio as gr
+# Define device
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 # Load tokenizer
+tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
 # Load the trained model
 def load_model(model_path="distilbert_spam_model.pt"):
     model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
+    model.load_state_dict(torch.load(model_path, map_location=device))  # Load model weights
+    model.to(device)
     model.eval()  # Set model to evaluation mode
     return model
+# Load model globally
+model = load_model()
 # Classification function
 def classify_email(email_text):
     return accuracy
+# Performance metrics
+def generate_performance_metrics():
+    return {
+        "accuracy": "N/A",
+        "precision": "N/A",
+        "recall": "N/A",
+        "f1_score": "N/A",
+        "confusion_matrix_plot": "",
+    }
+performance_metrics = generate_performance_metrics()
+# Gradio Interface
 def create_interface():
+    with gr.Blocks() as interface:
         gr.Markdown("Spam Email Classification")
         # Email Text Input
+        email_input = gr.Textbox(
+            lines=8, placeholder="Type or paste your email content here...", label="Email Content"
+        )
         # Email Text Results and Analysis
+        result_output = gr.Textbox(label="Classification Result")
+        confidence_output = gr.Textbox(label="Confidence Score", interactive=False)
+        accuracy_output = gr.Textbox(label="Accuracy", interactive=False)
         analyze_button = gr.Button("Analyze Email 🕵️‍♂️")
         analyze_button.click(
+            fn=classify_email,
             inputs=email_input,
             outputs=[result_output, confidence_output, accuracy_output]
         )
         gr.Markdown("## 📊 Model Performance Analytics")
         with gr.Row():
+            gr.Textbox(value=performance_metrics["accuracy"], label="Accuracy", interactive=False)
+            gr.Textbox(value=performance_metrics["precision"], label="Precision", interactive=False)
+            gr.Textbox(value=performance_metrics["recall"], label="Recall", interactive=False)
+            gr.Textbox(value=performance_metrics["f1_score"], label="F1 Score", interactive=False)
     return interface