Update app.py
Using DistilBERT Run 3 with best accuracy of 0.9942
app.py
CHANGED
@@ -1,102 +1,15 @@
-#
-"""Untitled3.ipynb
-
-Automatically generated by Colab.
-
-Original file is located at
-    https://colab.research.google.com/drive/1BTaF9lue6oXAqEx5zFRq1cnWWQ9YKCiQ
-"""
-
-import pandas as pd
-import numpy as np
-import torch
-from transformers import BertTokenizer
-import seaborn as sns
-import matplotlib.pyplot as plt
-from sklearn.feature_extraction.text import CountVectorizer
-
-
-# Load dataset
-file_path = 'spam_ham_dataset.csv'
-df = pd.read_csv(file_path)
-df.head()
-
-# Preprocessing
-#.str.replace(r'[^\w\s]', '', regex=True) removes everthing except letters, numbers, and spaces
-# df['text'].str.lower() converts everything in the text column to lower case only
-df['text'] = df['text'].str.lower().str.replace(r'[^\w\s]', '', regex=True)
-df['text'].head()
-
-
-sns.countplot(x=df['label'])
-plt.title("Spam vs Ham Distribution")
-plt.show()
-
-# Calculate text length metrics
-df['char_count'] = df['text'].apply(len)
-df['word_count'] = df['text'].apply(lambda x: len(x.split()))
-# Plot word count distribution for spam and ham
-plt.figure(figsize=(12, 5))
-sns.histplot(data=df, x='word_count', hue='label', bins=30, kde=True)
-plt.xlim(0, 1000)
-plt.title("Word Count Distribution by Label")
-plt.xlabel("Number of Words")
-plt.ylabel("Frequency")
-plt.show()
-
-def get_top_words(corpus, n=None):
-    vec = CountVectorizer(stop_words='english').fit(corpus)
-    bag_of_words = vec.transform(corpus)
-    sum_words = bag_of_words.sum(axis=0)
-    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
-    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
-    return words_freq[:n]
-
-# Top 10 words for spam
-top_spam_words = get_top_words(df[df['label'] == "spam"]['text'], n=10)
-print("Top spam words:", top_spam_words)
-
-# Top 10 words for ham
-top_ham_words = get_top_words(df[df['label'] == "ham"]['text'], n=10)
-print("Top ham words:", top_ham_words)
-
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.naive_bayes import MultinomialNB
-from sklearn.metrics import classification_report
-
-# TF-IDF Vectorization
-vectorizer = TfidfVectorizer()
-X = vectorizer.fit_transform(df['text'])
-y = df['label_num']
-
-# Train-Test Split
-from sklearn.model_selection import train_test_split
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
-
-# Train Naïve Bayes Model
-nb_model = MultinomialNB()
-nb_model.fit(X_train, y_train)
-
-# Predictions
-y_pred = nb_model.predict(X_test)
-print(classification_report(y_test, y_pred))
-
-import pandas as pd
-import torch
-import torch.nn as nn
-import torch.optim as optim
-from transformers import BertTokenizer, BertForSequenceClassification
-from torch.utils.data import Dataset, DataLoader
+#DISTILLBERT RUN 3 , added weight_decay=0.01
+
 
 # Load dataset
 file_path = 'spam_ham_dataset.csv'
 df = pd.read_csv(file_path)
 
-# Convert
-df['label_num'] = df['label'].
+# Convert labels to numeric
+df['label_num'] = df['label'].map({'ham': 0, 'spam': 1})
 
 # Load tokenizer
-tokenizer =
+tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
 
 # Tokenize dataset
 encodings = tokenizer(df['text'].tolist(), padding=True, truncation=True, max_length=128, return_tensors="pt")
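Note on this hunk: it deletes the original import block without adding a replacement, so the new file goes on to use pd, torch, nn, optim, Dataset, DataLoader, DistilBertTokenizer, and DistilBertForSequenceClassification without importing them. A minimal import header the code below appears to need (inferred from the names used; not part of this commit):

    # Inferred imports; not part of this commit.
    import pandas as pd
    import torch
    import torch.nn as nn
    import torch.optim as optim
    from torch.utils.data import Dataset, DataLoader
    from transformers import DistilBertTokenizer, DistilBertForSequenceClassification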
@@ -112,8 +25,8 @@ class SpamDataset(Dataset):
         return len(self.labels)
 
     def __getitem__(self, idx):
-        item = {key: val[idx] for key, val in self.encodings.items()}
-        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
+        item = {key: val[idx] for key, val in self.encodings.items()}
+        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
         return item
 
 # Create dataset
@@ -124,28 +37,25 @@ train_size = int(0.8 * len(dataset))
 val_size = len(dataset) - train_size
 train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])
 
-# DataLoader
+# DataLoader with batch size
 def collate_fn(batch):
     keys = batch[0].keys()
-
-    return collated
+    return {key: torch.stack([b[key] for b in batch]) for key in keys}
 
-
-
-val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)
+train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
+val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)
 
-# Load
+# Load DistilBERT model
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-model =
+model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
 model.to(device)
 
 # Define optimizer and loss function
-optimizer = optim.AdamW(model.parameters(), lr=5e-5)
+optimizer = optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)
 loss_fn = nn.CrossEntropyLoss()
 
 # Training Loop
 EPOCHS = 10
-
 for epoch in range(EPOCHS):
     model.train()
     total_loss = 0
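The old collate_fn returned an undefined name (collated); the fixed version stacks each field across the batch. A minimal sketch of what it produces, assuming a hypothetical two-item batch whose values are equal-length tensors:

    # Hypothetical two-item batch, for illustration only.
    batch = [
        {"input_ids": torch.tensor([101, 2023, 102]), "labels": torch.tensor(0)},
        {"input_ids": torch.tensor([101, 4567, 102]), "labels": torch.tensor(1)},
    ]
    collated = {key: torch.stack([b[key] for b in batch]) for key in batch[0].keys()}
    # collated["input_ids"] has shape (2, 3); collated["labels"] has shape (2,)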
@@ -153,15 +63,12 @@ for epoch in range(EPOCHS):
     for batch in train_loader:
         optimizer.zero_grad()
 
-        # Move batch to device
         inputs = {key: val.to(device) for key, val in batch.items()}
-        labels = inputs.pop("labels").to(device)
+        labels = inputs.pop("labels").to(device)
 
-        # Forward pass
         outputs = model(**inputs)
         loss = loss_fn(outputs.logits, labels)
 
-        # Backward pass
         loss.backward()
         optimizer.step()
 
@@ -170,7 +77,25 @@
     avg_loss = total_loss / len(train_loader)
     print(f"Epoch {epoch+1}, Loss: {avg_loss:.4f}")
 
-
+# Save trained model
+torch.save(model.state_dict(), "distilbert_spam_model.pt")
+
+# Evaluation
+model.eval()
+correct = 0
+total = 0
+with torch.no_grad():
+    for batch in val_loader:
+        inputs = {key: val.to(device) for key, val in batch.items()}
+        labels = inputs.pop("labels").to(device)
+
+        outputs = model(**inputs)
+        predictions = torch.argmax(outputs.logits, dim=1)
+        correct += (predictions == labels).sum().item()
+        total += labels.size(0)
+
+accuracy = correct / total
+print(f"Validation Accuracy: {accuracy:.4f}")
 
 from sklearn.metrics import classification_report
 from transformers import BertTokenizer
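This hunk saves the fine-tuned weights with torch.save(model.state_dict(), ...) but never loads them back. A plausible reload pattern for that checkpoint (an assumption; not shown anywhere in the commit):

    # Assumed reload pattern; the commit itself never reads the checkpoint.
    model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
    model.load_state_dict(torch.load("distilbert_spam_model.pt", map_location=device))
    model.to(device)
    model.eval()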
@@ -245,30 +170,9 @@ def evaluate_model_with_report(val_loader):
 accuracy = evaluate_model_with_report(val_loader)
 print(f"Model Validation Accuracy: {accuracy:.4f}")
 
-## App Deployment Functions
-
-def generate_performance_metrics():
-    y_pred = model.predict(X_test)
-    accuracy = evaluate_model_with_report(val_loader)
-    report = classification_report(y_true, y_pred, target_names=["Ham", "Spam"])
-    return {
-        "accuracy": f"{accuracy:.2%}",
-        "precision": f"{report['1']['precision']:.2%}",
-        "recall": f"{report['1']['recall']:.2%}",
-        "f1_score": f"{report['1']['f1-score']:.2%}"
-    }
-
-def email_analysis_pipeline(email_text):
-    results = classify_email(email_text)
-    accuracy = evaluate_model_with_report(val_loader)
-    return {
-        results["result"],
-        results["confidence"],
-        accuracy
-    }
-
 ## Gradio Interface
 
+!pip install gradio
 import gradio as gr
 
 # Create Gradio Interface
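The deleted generate_performance_metrics had a latent bug worth recording: classification_report returns a plain string by default, so report['1']['precision'] would raise a TypeError. A corrected sketch of the dict-style report it apparently intended, reusing y_true and y_pred as named in the deleted code:

    # Sketch only: output_dict=True makes the report indexable,
    # and its keys follow target_names rather than "0"/"1".
    from sklearn.metrics import classification_report
    report = classification_report(y_true, y_pred, target_names=["Ham", "Spam"], output_dict=True)
    spam_precision = report["Spam"]["precision"]
    spam_recall = report["Spam"]["recall"]
    spam_f1 = report["Spam"]["f1-score"]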
@@ -338,58 +242,3 @@ def create_interface():
 # Launch the interface
 interface = create_interface()
 interface.launch(share=True)
-
-## CSS
-
-# Updated CSS
-custom_css = """
-body {
-    font-family: 'Arial', sans-serif;
-    background-image: url('https://cdn.pixabay.com/photo/2016/11/19/15/26/email-1839873_1280.jpg');
-    background-size: cover;
-    background-position: center;
-    background-attachment: fixed;
-    color: #333;
-}
-h1, h2, h3 {
-    text-align: center;
-    color: #ffffff;
-    text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.7);
-}
-.gradio-container {
-    background-color: rgba(255, 255, 255, 0.8);
-    border-radius: 10px;
-    padding: 20px;
-    box-shadow: 0px 4px 10px rgba(0, 0, 0, 0.3);
-}
-button {
-    background-color: #1e90ff;
-    color: white;
-    padding: 10px 20px;
-    border: none;
-    border-radius: 5px;
-    cursor: pointer;
-    font-size: 1.2em;
-    transition: transform 0.2s, background-color 0.3s;
-}
-button:hover {
-    background-color: #1c86ee;
-    transform: scale(1.05);
-}
-.highlight {
-    background-color: #ffeb3b;
-    font-weight: bold;
-    padding: 0 3px;
-    border-radius: 3px;
-}
-.metric {
-    font-size: 1.2em;
-    text-align: center;
-    color: #ffffff;
-    background-color: #4CAF50;
-    border-radius: 8px;
-    padding: 10px;
-    margin: 10px 0;
-    box-shadow: 2px 2px 5px rgba(0, 0, 0, 0.2);
-}
-"""
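Two closing notes on the new file. First, the added !pip install gradio line is notebook magic, not Python; it will raise a SyntaxError when app.py runs as a plain script, and on Spaces the dependency normally belongs in requirements.txt. Second, the interface code relies on a classify_email helper that this diff never shows; a hypothetical sketch of its shape, matching the result/confidence keys the deleted email_analysis_pipeline expected and the ham=0/spam=1 mapping defined above:

    # Hypothetical classify_email; the real helper lives in the unchanged part of app.py.
    def classify_email(email_text):
        enc = tokenizer(email_text, truncation=True, max_length=128, return_tensors="pt").to(device)
        with torch.no_grad():
            probs = torch.softmax(model(**enc).logits, dim=1)[0]
        pred = int(torch.argmax(probs))
        return {"result": "Spam" if pred == 1 else "Ham", "confidence": float(probs[pred])}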