Hamdy005 committed on
Commit
626deb4
·
verified ·
1 Parent(s): 429f9b2

Upload 9 files

Browse files
Dockerfile ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# 1. Use official Python image
FROM python:3.12-slim

# 2. Set working directory
WORKDIR /app

# 3. Install system dependencies
#    --no-install-recommends avoids pulling optional packages into the image;
#    the apt lists are removed in the same layer so they are never stored.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    && rm -rf /var/lib/apt/lists/*

# 4. Install Python dependencies
#    requirements.txt is copied on its own first so that this layer (and the
#    NLTK layer below) stays cached until the dependency list itself changes,
#    instead of being rebuilt on every source edit.
COPY requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# 5. Download NLTK Data
RUN python -m nltk.downloader wordnet stopwords

# 6. Copy project files
COPY . /app

# 7. Expose Streamlit Port
EXPOSE 8501

# 8. Run the Streamlit App
CMD ["streamlit", "run", "app.py", "--server.address=0.0.0.0"]
LICENSE ADDED
@@ -0,0 +1 @@
 
 
1
+
app.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pickle
3
+ import streamlit as st
4
+ import re
5
+ import nltk
6
+ import contractions
7
+ from nltk.corpus import stopwords
8
+ from nltk.stem import WordNetLemmatizer
9
# Fetch the NLTK corpora only when they are not already installed.
# Streamlit re-executes this script from the top on every user interaction,
# so an unconditional nltk.download() would repeat its download check on
# each rerun even though the Docker image already ships both corpora.
for _corpus in ('wordnet', 'stopwords'):
    try:
        nltk.data.find(f'corpora/{_corpus}')
    except LookupError:
        nltk.download(_corpus)

## Setting Page Configuration and Header
# set_page_config is kept as the first Streamlit command executed by the script.
st.set_page_config(
    page_title="Spam Email Classifier",
    page_icon="📧",
    layout="wide",
)

st.title("📧 Spam Email Classifier")
st.write("Enter your email content below and the model will predict whether it is Spam or Ham (Not Spam).")
21
+
22
+
23
+ ## Preprocessing Function
24
def preprocess_text(text):
    """Normalise raw email text into the cleaned form passed to the vectorizer.

    Steps, in order: lowercase, collapse whitespace, replace digit runs with
    a placeholder, shorten repeated characters to two, expand contractions,
    drop every character outside ``[a-z0-9\\s]``, and lemmatize each token.

    Args:
        text: Raw email content as a string.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    # Lowercase first, then squeeze every whitespace run into one space.
    normalized = re.sub(r'\s+', ' ', text.lower()).strip()

    # Ordered substitutions: digit runs -> placeholder token, then any
    # character repeated 2+ times -> exactly two (e.g. "soooo" -> "soo").
    for pattern, replacement in ((r'\d+', '<NUM>'), (r'(.)\1+', r'\1\1')):
        normalized = re.sub(pattern, replacement, normalized)

    # Expand contractions (e.g. weren't => were not).
    expanded = contractions.fix(normalized)

    # Keep only lowercase letters, digits and whitespace.
    # NOTE(review): this filter also deletes the '<NUM>' placeholder inserted
    # above (angle brackets and uppercase letters are outside the allowed
    # set), so numbers are effectively removed. Confirm this matches the
    # training-time preprocessing before changing it.
    filtered = re.sub(r'[^a-z0-9\s]', '', expanded)

    # Lemmatize token by token and re-join with single spaces.
    lemmatize = WordNetLemmatizer().lemmatize
    return ' '.join(lemmatize(token) for token in filtered.split())
52
+
53
+
54
+ ## Loading the Model and Vectorizer
55
@st.cache_resource
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    Decorated with ``st.cache_resource`` so each artefact is read from disk
    and unpickled once per server process rather than on every Streamlit
    rerun (the script is re-executed on each user interaction).

    Args:
        path: Location of the pickle file, relative to the working directory.

    Returns:
        The unpickled object; the same instance is returned on later calls.
    """
    with open(path, "rb") as file:
        # NOTE: pickle.load can execute arbitrary code contained in the file;
        # only load artefacts that ship with this application.
        return pickle.load(file)


model = _load_pickle('models/logistic_regression.pkl')
vectorizer = _load_pickle("models/vectorizer.pkl")
60
+
61
+
62
+ ## Prediction
63
email_text = st.text_area("Email Content:")

if st.button("Predict"):

    # Treat whitespace-only input as empty: after preprocessing it would
    # become an empty document and still produce a (meaningless) prediction.
    if email_text.strip():
        processed_text = preprocess_text(email_text)
        vect_text = vectorizer.transform([processed_text])

        prediction = model.predict(vect_text)[0]
        prediction_proba = model.predict_proba(vect_text)[0]

        st.subheader("Prediction Result:")
        # Label 1 is reported as spam, any other label as not spam.
        if prediction == 1:
            st.error("🚫 This email is Spam")
        else:
            st.success("✅ This email is Not Spam")

        st.subheader("Prediction Probabilities:")
        # assumes model.classes_ is ordered [0 (ham), 1 (spam)] — TODO confirm
        st.write(f"Ham: {prediction_proba[0]:.2f}, Spam: {prediction_proba[1]:.2f}")

    else:
        st.warning("Please enter email content to predict.")
models/logistic_regression.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:05a739634741c73481ef0b0441469f0dad901a3b67d01284d11b681efa263862
3
+ size 1704165
models/vectorizer.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01fc04703cb257ef9d48ce94b8ae018bb2c0179e07f73091cf0e5ad8038c3675
3
+ size 4988967
notebooks/Logistic_Regression.ipynb ADDED
@@ -0,0 +1,1530 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "17c590a6",
6
+ "metadata": {},
7
+ "source": [
8
+ "## 1. Reading and Exploring Data"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": 1,
14
+ "id": "fdbfdfe0",
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "import pandas as pd\n",
19
+ "import matplotlib.pyplot as plt"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "code",
24
+ "execution_count": 2,
25
+ "id": "a122aae2",
26
+ "metadata": {},
27
+ "outputs": [
28
+ {
29
+ "data": {
30
+ "text/html": [
31
+ "<div>\n",
32
+ "<style scoped>\n",
33
+ " .dataframe tbody tr th:only-of-type {\n",
34
+ " vertical-align: middle;\n",
35
+ " }\n",
36
+ "\n",
37
+ " .dataframe tbody tr th {\n",
38
+ " vertical-align: top;\n",
39
+ " }\n",
40
+ "\n",
41
+ " .dataframe thead th {\n",
42
+ " text-align: right;\n",
43
+ " }\n",
44
+ "</style>\n",
45
+ "<table border=\"1\" class=\"dataframe\">\n",
46
+ " <thead>\n",
47
+ " <tr style=\"text-align: right;\">\n",
48
+ " <th></th>\n",
49
+ " <th>label</th>\n",
50
+ " <th>text</th>\n",
51
+ " </tr>\n",
52
+ " </thead>\n",
53
+ " <tbody>\n",
54
+ " <tr>\n",
55
+ " <th>0</th>\n",
56
+ " <td>1</td>\n",
57
+ " <td>ounce feather bowl hummingbird opec moment ala...</td>\n",
58
+ " </tr>\n",
59
+ " <tr>\n",
60
+ " <th>1</th>\n",
61
+ " <td>1</td>\n",
62
+ " <td>wulvob get your medircations online qnb ikud v...</td>\n",
63
+ " </tr>\n",
64
+ " <tr>\n",
65
+ " <th>2</th>\n",
66
+ " <td>0</td>\n",
67
+ " <td>computer connection from cnn com wednesday es...</td>\n",
68
+ " </tr>\n",
69
+ " <tr>\n",
70
+ " <th>3</th>\n",
71
+ " <td>1</td>\n",
72
+ " <td>university degree obtain a prosperous future m...</td>\n",
73
+ " </tr>\n",
74
+ " <tr>\n",
75
+ " <th>4</th>\n",
76
+ " <td>0</td>\n",
77
+ " <td>thanks for all your answers guys i know i shou...</td>\n",
78
+ " </tr>\n",
79
+ " </tbody>\n",
80
+ "</table>\n",
81
+ "</div>"
82
+ ],
83
+ "text/plain": [
84
+ " label text\n",
85
+ "0 1 ounce feather bowl hummingbird opec moment ala...\n",
86
+ "1 1 wulvob get your medircations online qnb ikud v...\n",
87
+ "2 0 computer connection from cnn com wednesday es...\n",
88
+ "3 1 university degree obtain a prosperous future m...\n",
89
+ "4 0 thanks for all your answers guys i know i shou..."
90
+ ]
91
+ },
92
+ "execution_count": 2,
93
+ "metadata": {},
94
+ "output_type": "execute_result"
95
+ }
96
+ ],
97
+ "source": [
98
+ "df = pd.read_csv('../data/raw/data.csv')\n",
99
+ "df.head()"
100
+ ]
101
+ },
102
+ {
103
+ "cell_type": "code",
104
+ "execution_count": 3,
105
+ "id": "713d51b3",
106
+ "metadata": {},
107
+ "outputs": [
108
+ {
109
+ "name": "stdout",
110
+ "output_type": "stream",
111
+ "text": [
112
+ "<class 'pandas.core.frame.DataFrame'>\n",
113
+ "RangeIndex: 83448 entries, 0 to 83447\n",
114
+ "Data columns (total 2 columns):\n",
115
+ " # Column Non-Null Count Dtype \n",
116
+ "--- ------ -------------- ----- \n",
117
+ " 0 label 83448 non-null int64 \n",
118
+ " 1 text 83448 non-null object\n",
119
+ "dtypes: int64(1), object(1)\n",
120
+ "memory usage: 1.3+ MB\n"
121
+ ]
122
+ }
123
+ ],
124
+ "source": [
125
+ "df.info()"
126
+ ]
127
+ },
128
+ {
129
+ "cell_type": "code",
130
+ "execution_count": 4,
131
+ "id": "9a2f29b3",
132
+ "metadata": {},
133
+ "outputs": [
134
+ {
135
+ "name": "stdout",
136
+ "output_type": "stream",
137
+ "text": [
138
+ "Number of Null Values = 0\n",
139
+ "Number of Duplicated Rows = 0\n"
140
+ ]
141
+ }
142
+ ],
143
+ "source": [
144
+ "nulls = sum(df.isnull().sum())\n",
145
+ "duplicated = sum(df.duplicated())\n",
146
+ "\n",
147
+ "print(f\"Number of Null Values = {nulls}\")\n",
148
+ "print(f\"Number of Duplicated Rows = {duplicated}\")"
149
+ ]
150
+ },
151
+ {
152
+ "cell_type": "code",
153
+ "execution_count": 5,
154
+ "id": "78bc23cd",
155
+ "metadata": {},
156
+ "outputs": [
157
+ {
158
+ "data": {
159
+ "text/plain": [
160
+ "Not Spam 39538\n",
161
+ "Spam 43910\n",
162
+ "Name: count, dtype: int64"
163
+ ]
164
+ },
165
+ "execution_count": 5,
166
+ "metadata": {},
167
+ "output_type": "execute_result"
168
+ }
169
+ ],
170
+ "source": [
171
+ "# Checking Dataset Balance\n",
172
+ "value_counts = df['label'].value_counts().sort_index()\n",
173
+ "value_counts.index = ['Not Spam', 'Spam']\n",
174
+ "\n",
175
+ "value_counts"
176
+ ]
177
+ },
178
+ {
179
+ "cell_type": "code",
180
+ "execution_count": 6,
181
+ "id": "3f8e082e",
182
+ "metadata": {},
183
+ "outputs": [
184
+ {
185
+ "data": {
186
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAxYAAAHqCAYAAACZcdjsAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjcsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvTLEjVAAAAAlwSFlzAAAPYQAAD2EBqD+naQAANxVJREFUeJzt3Xl0VOXh//HPJJAFkgmEJTElEDaBKEsJW6AqlMioIFKhkkoxLFpBoEAoIBXCojaClcUC0tZKqAVZ/BZUkKAEWb4QQUKDhBK0LQo2ZhFIBoIkkNzfH34zP8awSJ7AJPB+nTPnMPc+c+9z5xwjb27uvTbLsiwBAAAAgAEvT08AAAAAQPVHWAAAAAAwRlgAAAAAMEZYAAAAADBGWAAAAAAwRlgAAAAAMEZYAAAAADBGWAAAAAAwRlgAAAAAMEZYAACuyxdffCGbzaakpCRPTwUAUIUQFgBwC+vfv79q1aqlM2fOXHHMkCFD5OPjo5MnT97EmZlJT0/XL3/5S4WHh8vX11fBwcGKiYnR8uXLVVJS4unpSZJ+97vfacOGDZ6eBgDcNIQFANzChgwZom+//Vbr16+/7Ppz587pnXfe0QMPPKB69erd5NlVzOuvv65OnTrpo48+0pAhQ7R06VIlJCTI399fI0eO1Ny5cz09RUmEBYDbTw1PTwAAcOP0799fgYGBWrVqlZ544oly69955x0VFhZqyJAhHpjd9fv44481atQoRUdH6/3331dgYKBr3YQJE7R//35lZGR4cIYAcPvijAUA3ML8/f316KOPKiUlRbm5ueXWr1q1SoGBgerfv79OnTql3/zmN2rbtq0CAgJkt9v14IMP6uDBg9fcT8+ePdWzZ89yy4cNG6aIiAi3ZaWlpVq4cKHuuusu+fn5KSQkRE8//bROnz59zf3Mnj1bNptNK1eudIuKMp06ddKwYcNc7wsLCzVp0iTXr0y1atVKv//972VZlmvM1a4ZsdlsmjVrluv9rFmzZLPZ9K9//UvDhg1TnTp1FBQUpOHDh+vcuXNunyssLNSKFStks9lks9lc8zpz5owmTJigiIgI+fr6qmHDhrr//vt14MCBax4/AFRlhAUA3OKGDBmiixcvau3atW7LT506pS1btuhnP/uZ/P399Z///EcbNmxQv379NH/+fE2ePFmHDh3Sfffdp6ysrEqbz9NPP63JkyerR48eWrRokYYPH66VK1fK4XDowoULV/zcuXPnlJKSonvvvVeNGze+5n4sy1L//v21YMECPfDAA5o/f75atWqlyZMnKz4+3ugYHnvsMZ05c0aJiYl67LHHlJSUpNmzZ7vWv/nmm/L19dU999yjN998U2+++aaefvppSdKoUaP02muvaeDAgVq6dKl+85vfyN/fX0eOHDGaEwB4nAUAuKVdvHjRuuOOO6zo6Gi35cuWLbMkWVu2bLEsy7LOnz9vlZSUuI05duyY5evra82ZM8dtmSRr+fLlrmX33Xefdd9995Xbd1xcnNWkSRPX+127dlmSrJUrV7qNS05OvuzySx08eNCSZI0fP/4aR/ydDRs2WJKsF154wW35oEGDLJvNZv3rX/+64vGUkWTNnDnT9X7mzJmWJGvEiBFu4372s59Z9erVc1tWu3ZtKy4urtw2g4KCrDFjxvygYwCA6oQzFgBwi/P29lZsbKxSU1P1xRdfuJavWrVKISEh6t27tyTJ19dXXl7f/W+hpKREJ0+eVEBAgFq1alVpv6azbt06BQUF6f7779c333zjekVFRSkgIEAfffTRFT/rdDol6bK/AnU577//vry9vfXrX//abfmkSZNkWZY2b95c4eMYNWqU2/t77rlHJ0+edM3xaurUqaO9e/dW6lkgAKgKCAsAuA2UXZy9atUqSdJXX32lXbt2KTY2Vt7e3pK+u/ZhwYIFatmypXx9fVW/fn01aNBAn376qQoKCiplHp9//rkKCgrUsGFDNWjQwO119uzZy14H
UsZut0vSVW+de6kvv/xSYWFh5UKkTZs2rvUV9f1fxapbt64k/aDrRObNm6eMjAyFh4erS5cumjVrlv7zn/9UeC4AUFVwVygAuA1ERUWpdevWeuutt/Tb3/5Wb731lizLcrsb1O9+9zvNmDFDI0aM0PPPP6/g4GB5eXlpwoQJKi0tver2bTab2wXRZb7/TInS0lI1bNhQK1euvOx2GjRocMV9tGjRQjVq1NChQ4euOpfrZbPZLrv8as/DKIux77vcd/B9jz32mO655x6tX79eH3zwgV5++WXNnTtXf//73/Xggw/+sEkDQBVEWADAbWLIkCGaMWOGPv30U61atUotW7ZU586dXevffvtt9erVS3/5y1/cPpefn6/69etfddt169a97L+6f/+sQPPmzbV161b16NFD/v7+1zX/WrVq6ac//am2bdumEydOKDw8/KrjmzRpoq1bt+rMmTNuZy0yMzNd68vmLn13nFeb+/W6UrBI0h133KFnnnlGzzzzjHJzc9WxY0e9+OKLhAWAao1fhQKA20TZ2YmEhASlp6eXe3aFt7d3uX9xX7dunf773/9ec9vNmzdXZmam8vLyXMsOHjyo3bt3u4177LHHVFJSoueff77cNi5evFjuL/ffN3PmTFmWpaFDh+rs2bPl1qelpWnFihWSpIceekglJSVavHix25gFCxbIZrO5/hJvt9tVv3597dy5023c0qVLrzqXa6ldu3a54ykpKSn3a2UNGzZUWFiYioqKjPYHAJ7GGQsAuE00bdpU3bt31zvvvCNJ5cKiX79+mjNnjoYPH67u3bvr0KFDWrlypZo1a3bNbY8YMULz58+Xw+HQyJEjlZubq2XLlumuu+5yu6D5vvvu09NPP63ExESlp6erT58+qlmzpj7//HOtW7dOixYt0qBBg664n+7du2vJkiV65pln1Lp1aw0dOlQtW7bUmTNntH37dr377rt64YUXJEkPP/ywevXqpeeee05ffPGF2rdvrw8++EDvvPOOJkyYoObNm7u2++STT+qll17Sk08+qU6dOmnnzp367LPPruv7/b6oqCht3bpV8+fPV1hYmJo2bapWrVqpUaNGGjRokNq3b6+AgABt3bpVn3zyiV555RWj/QGAx3nyllQAgJtryZIlliSrS5cu5dadP3/emjRpknXHHXdY/v7+Vo8ePazU1NRyt5K90u1Z//a3v1nNmjWzfHx8rA4dOlhbtmwpd7vZMn/605+sqKgoy9/f3woMDLTatm1rTZkyxcrKyvpBx5GWlmY9/vjjVlhYmFWzZk2rbt26Vu/eva0VK1a43TL3zJkz1sSJE13jWrZsab388stWaWmp2/bOnTtnjRw50goKCrICAwOtxx57zMrNzb3i7Wbz8vLcPr98+XJLknXs2DHXsszMTOvee++1/P39LUlWXFycVVRUZE2ePNlq3769FRgYaNWuXdtq3769tXTp0h903ABQldks6wdcaQYAAAAAV8E1FgAAAACMERYAAAAAjBEWAAAAAIwRFgAAAACMERYAAAAAjBEWAAAAAIzxgLxKUlpaqqysLAUGBspms3l6OgAAAIAxy7J05swZhYWFycvr6uckCItKkpWVpfDwcE9PAwAAAKh0J06cUKNGja46hrCoJIGBgZK++9LtdruHZwMAAACYczqdCg8Pd/1d92oIi0pS9utPdrudsAAAAMAt5Yf8qj8XbwMAAAAwRlgAAAAAMEZYAAAAADBGWAAAAAAwRlgAAAAAMEZYAAAAADBGWAAAAAAwRlgAAAAAMEZYAAAAADBGWAAAAAAwRlgAAAAAMEZYAAAAADBGWAAAAAAwRlgAAAAAMEZYAAAAADBGWAAAAAAwRlgAAAAAMFbD0xMAAKC6+3eDezw9BQC3sOZ5uzw9hR+EMxYAAAAAjBEWAAAAAIwRFgAAAACMERYAAAAAjBEWAAAAAIwRFgAAAACMERYAAAAAjBEWAAAAAIwRFgAAAACMERYAAAAAjBEWAAAA
AIwRFgAAAACMERYAAAAAjBEWAAAAAIwRFgAAAACMERYAAAAAjBEWAAAAAIwRFgAAAACMERYAAAAAjBEWAAAAAIwRFgAAAACMERYAAAAAjBEWAAAAAIwRFgAAAACMERYAAAAAjBEWAAAAAIwRFgAAAACMERYAAAAAjFWZsHjppZdks9k0YcIE17Lz589rzJgxqlevngICAjRw4EDl5OS4fe748ePq27evatWqpYYNG2ry5Mm6ePGi25jt27erY8eO8vX1VYsWLZSUlFRu/0uWLFFERIT8/PzUtWtX7du370YcJgAAAHBLqhJh8cknn+iPf/yj2rVr57Z84sSJeu+997Ru3Trt2LFDWVlZevTRR13rS0pK1LdvXxUXF2vPnj1asWKFkpKSlJCQ4Bpz7Ngx9e3bV7169VJ6eromTJigJ598Ulu2bHGNWbNmjeLj4zVz5kwdOHBA7du3l8PhUG5u7o0/eAAAAOAWYLMsy/LkBM6ePauOHTtq6dKleuGFF9ShQwctXLhQBQUFatCggVatWqVBgwZJkjIzM9WmTRulpqaqW7du2rx5s/r166esrCyFhIRIkpYtW6apU6cqLy9PPj4+mjp1qjZt2qSMjAzXPmNjY5Wfn6/k5GRJUteuXdW5c2ctXrxYklRaWqrw8HCNGzdOzz777A86DqfTqaCgIBUUFMhut1fmVwQAqOL+3eAeT08BwC2sed4uj+37ev6O6/EzFmPGjFHfvn0VExPjtjwtLU0XLlxwW966dWs1btxYqampkqTU1FS1bdvWFRWS5HA45HQ6dfjwYdeY72/b4XC4tlFcXKy0tDS3MV5eXoqJiXGNAQAAAHB1NTy589WrV+vAgQP65JNPyq3Lzs6Wj4+P6tSp47Y8JCRE2dnZrjGXRkXZ+rJ1VxvjdDr17bff6vTp0yopKbnsmMzMzCvOvaioSEVFRa73TqfzGkcLAAAA3Lo8dsbixIkTGj9+vFauXCk/Pz9PTaPCEhMTFRQU5HqFh4d7ekoAAACAx3gsLNLS0pSbm6uOHTuqRo0aqlGjhnbs2KFXX31VNWrUUEhIiIqLi5Wfn+/2uZycHIWGhkqSQkNDy90lquz9tcbY7Xb5+/urfv368vb2vuyYsm1czrRp01RQUOB6nThxokLfAwAAAHAr8FhY9O7dW4cOHVJ6errr1alTJw0ZMsT155o1ayolJcX1maNHj+r48eOKjo6WJEVHR+vQoUNud2/68MMPZbfbFRkZ6Rpz6TbKxpRtw8fHR1FRUW5jSktLlZKS4hpzOb6+vrLb7W4vAAAA4HblsWssAgMDdffdd7stq127turVq+daPnLkSMXHxys4OFh2u13jxo1TdHS0unXrJknq06ePIiMjNXToUM2bN0/Z2dmaPn26xowZI19fX0nSqFGjtHjxYk2ZMkUjRozQtm3btHbtWm3atMm13/j4eMXFxalTp07q0qWLFi5cqMLCQg0fPvwmfRsAAABA9ebRi7evZcGCBfLy8tLAgQNVVFQkh8OhpUuXutZ7e3tr48aNGj16tKKjo1W7dm3FxcVpzpw5rjFNmzbVpk2bNHHiRC1atEiNGjXS66+/LofD4RozePBg5eXlKSEhQdnZ2erQoYOSk5PLXdBd1TVZ5OkZALiVfTne0zMAAFRlHn+Oxa2iKjzHgrAAcCMRFlfGcywA3Eg8xwIAAADAbYOwAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAA
YIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGDMo2Hx2muvqV27drLb7bLb7YqOjtbmzZtd68+fP68xY8aoXr16CggI0MCBA5WTk+O2jePHj6tv376qVauWGjZsqMmTJ+vixYtuY7Zv366OHTvK19dXLVq0UFJSUrm5LFmyRBEREfLz81PXrl21b9++G3LMAAAAwK3Io2HRqFEjvfTSS0pLS9P+/fv105/+VI888ogOHz4sSZo4caLee+89rVu3Tjt27FBWVpYeffRR1+dLSkrUt29fFRcXa8+ePVqxYoWSkpKUkJDgGnPs2DH17dtXvXr1Unp6uiZMmKAnn3xSW7ZscY1Zs2aN4uPjNXPmTB04cEDt27eXw+FQbm7uzfsyAAAAgGrMZlmW5elJXCo4OFgvv/yyBg0apAYNGmjVqlUaNGiQJCkzM1Nt2rRRamqqunXrps2bN6tfv37KyspSSEiIJGnZsmWaOnWq8vLy5OPjo6lTp2rTpk3KyMhw7SM2Nlb5+flKTk6WJHXt2lWdO3fW4sWLJUmlpaUKDw/XuHHj9Oyzz/6geTudTgUFBamgoEB2u70yv5IfrMkij+wWwG3iy/GenkHV9e8G93h6CgBuYc3zdnls39fzd9wqc41FSUmJVq9ercLCQkVHRystLU0XLlxQTEyMa0zr1q3VuHFjpaamSpJSU1PVtm1bV1RIksPhkNPpdJ31SE1NddtG2ZiybRQXFystLc1tjJeXl2JiYlxjAAAAAFxdDU9P4NChQ4qOjtb58+cVEBCg9evXKzIyUunp6fLx8VGdOnXcxoeEhCg7O1uSlJ2d7RYVZevL1l1tjNPp1LfffqvTp0+rpKTksmMyMzOvOO+ioiIVFRW53judzus7cAAAAOAW4vEzFq1atVJ6err27t2r0aNHKy4uTv/85z89Pa1rSkxMVFBQkOsVHh7u6SkBAAAAHuPxsPDx8VGLFi0UFRWlxMREtW/fXosWLVJoaKiKi4uVn5/vNj4nJ0ehoaGSpNDQ0HJ3iSp7f60xdrtd/v7+ql+/vry9vS87pmwblzNt2jQVFBS4XidOnKjQ8QMAAAC3Ao+HxfeVlpaqqKhIUVFRqlmzplJSUlzrjh49quPHjys6OlqSFB0drUOHDrndvenDDz+U3W5XZGSka8yl2ygbU7YNHx8fRUVFuY0pLS1VSkqKa8zl+Pr6um6TW/YCAAAAblcevcZi2rRpevDBB9W4cWOdOXNGq1at0vbt27VlyxYFBQVp5MiRio+PV3BwsOx2u8aNG6fo6Gh169ZNktSnTx9FRkZq6NChmjdvnrKzszV9+nSNGTNGvr6+kqRRo0Zp8eLFmjJlikaMGKFt27Zp7dq12rRpk2se8fHxiouLU6dOndSlSxctXLhQhYWFGj58uEe+FwAAAKC68WhY5Obm6oknntDXX3+toKAgtWvXTlu2bNH9998vSVqwYIG8vLw0cOBAFRUVyeFwaOnSpa7Pe3t7a+PGjRo9erSio6NVu3ZtxcXFac6cOa4xTZs21aZNmzRx4kQtWrRIjRo10uuvvy6Hw+EaM3jwYOXl5SkhIUHZ2dnq0KGDkpOTy13QDQAAAODyqtxzLKornmMB4FbHcyyujOdYALiReI4FAAAA
gNsGYQEAAADAGGEBAAAAwBhhAQAAAMAYYQEAAADAGGEBAAAAwBhhAQAAAMAYYQEAAADAGGEBAAAAwBhhAQAAAMAYYQEAAADAGGEBAAAAwBhhAQAAAMAYYQEAAADAGGEBAAAAwBhhAQAAAMAYYQEAAADAGGEBAAAAwBhhAQAAAMAYYQEAAADAGGEBAAAAwBhhAQAAAMAYYQEAAADAGGEBAAAAwBhhAQAAAMAYYQEAAADAGGEBAAAAwBhhAQAAAMAYYQEAAADAGGEBAAAAwBhhAQAAAMAYYQEAAADAGGEBAAAAwBhhAQAAAMAYYQEAAADAGGEBAAAAwFiFwqJZs2Y6efJkueX5+flq1qyZ8aQAAAAAVC8VCosvvvhCJSUl5ZYXFRXpv//9r/GkAAAAAFQvNa5n8Lvvvuv685YtWxQUFOR6X1JSopSUFEVERFTa5AAAAABUD9cVFgMGDJAk2Ww2xcXFua2rWbOmIiIi9Morr1Ta5AAAAABUD9cVFqWlpZKkpk2b6pNPPlH9+vVvyKQAAAAAVC/XFRZljh07VtnzAAAAAFCNVSgsJCklJUUpKSnKzc11ncko88YbbxhPDAAAAED1UaGwmD17tubMmaNOnTrpjjvukM1mq+x5AQAAAKhGKhQWy5YtU1JSkoYOHVrZ8wEAAABQDVXoORbFxcXq3r17Zc8FAAAAQDVVobB48skntWrVqsqeCwAAAIBqqkK/CnX+/Hn96U9/0tatW9WuXTvVrFnTbf38+fMrZXIAAAAAqocKhcWnn36qDh06SJIyMjLc1nEhNwAAAHD7qVBYfPTRR5U9DwAAAADVWIWusQAAAACAS1XojEWvXr2u+itP27Ztq/CEAAAAAFQ/FQqLsusryly4cEHp6enKyMhQXFxcZcwLAAAAQDVSobBYsGDBZZfPmjVLZ8+eNZoQAAAAgOqnUq+x+OUvf6k33nijMjcJAAAAoBqo1LBITU2Vn59fZW4SAAAAQDVQoV+FevTRR93eW5alr7/+Wvv379eMGTMqZWIAAAAAqo8KhUVQUJDbey8vL7Vq1Upz5sxRnz59KmViAAAAAKqPCoXF8uXLK3seAAAAAKqxCoVFmbS0NB05ckSSdNddd+nHP/5xpUwKAAAAQPVSobDIzc1VbGystm/frjp16kiS8vPz1atXL61evVoNGjSozDkCAAAAqOIqdFeocePG6cyZMzp8+LBOnTqlU6dOKSMjQ06nU7/+9a8re44AAAAAqrgKnbFITk7W1q1b1aZNG9eyyMhILVmyhIu3AQAAgNtQhc5YlJaWqmbNmuWW16xZU6WlpcaTAgAAAFC9VCgsfvrTn2r8+PHKyspyLfvvf/+riRMnqnfv3pU2OQAAAADVQ4XCYvHixXI6nYqIiFDz5s3VvHlzNW3aVE6nU3/4wx8qe44AAAAAqrgKXWMRHh6uAwcOaOvWrcrMzJQktWnTRjExMZU6OQAAAADVw3Wdsdi2bZsiIyPldDpls9l0//33a9y4cRo3bpw6d+6su+66S7t27bpRcwUAAABQRV1XWCxcuFBPPfWU7HZ7uXVBQUF6+umnNX/+/EqbHAAAAIDq4brC4uDBg3rggQeuuL5Pnz5KS0sznhQAAACA6uW6wiInJ+eyt5ktU6NGDeXl5RlPCgAAAED1cl1h8aMf/UgZGRlXXP/pp5/qjjvuMJ4UAAAAgOrlusLioYce0owZM3T+/Ply67799lvNnDlT/fr1q7TJAQAAAKgerut2s9OnT9ff//533XnnnRo7dqxatWolScrMzNSSJUtUUlKi55577oZMFAAAAEDVdV1hERISoj179mj06NGaNm2aLMuSJNlsNjkcDi1ZskQhISE3ZKIAAAAAqq7rfkBekyZN9P777+v06dP617/+Jcuy1LJlS9WtW/dGzA8AAABANVChJ29LUt26ddW5c+fKnAsAAACAauq6Lt6ubImJiercubMCAwPVsGFDDRgwQEePHnUbc/78eY0ZM0b16tVT
QECABg4cqJycHLcxx48fV9++fVWrVi01bNhQkydP1sWLF93GbN++XR07dpSvr69atGihpKSkcvNZsmSJIiIi5Ofnp65du2rfvn2VfswAAADArcijYbFjxw6NGTNGH3/8sT788ENduHBBffr0UWFhoWvMxIkT9d5772ndunXasWOHsrKy9Oijj7rWl5SUqG/fviouLtaePXu0YsUKJSUlKSEhwTXm2LFj6tu3r3r16qX09HRNmDBBTz75pLZs2eIas2bNGsXHx2vmzJk6cOCA2rdvL4fDodzc3JvzZQAAAADVmM0quwK7CsjLy1PDhg21Y8cO3XvvvSooKFCDBg20atUqDRo0SNJ3d6Bq06aNUlNT1a1bN23evFn9+vVTVlaW68LxZcuWaerUqcrLy5OPj4+mTp2qTZs2uT2DIzY2Vvn5+UpOTpYkde3aVZ07d9bixYslSaWlpQoPD9e4ceP07LPPXnPuTqdTQUFBKigokN1ur+yv5gdpssgjuwVwm/hyvKdnUHX9u8E9np4CgFtY87xdHtv39fwd16NnLL6voKBAkhQcHCxJSktL04ULFxQTE+Ma07p1azVu3FipqamSpNTUVLVt29btblQOh0NOp1OHDx92jbl0G2VjyrZRXFystLQ0tzFeXl6KiYlxjfm+oqIiOZ1OtxcAAABwu6oyYVFaWqoJEyaoR48euvvuuyVJ2dnZ8vHxUZ06ddzGhoSEKDs72zXm+7e4LXt/rTFOp1PffvutvvnmG5WUlFx2TNk2vi8xMVFBQUGuV3h4eMUOHAAAALgFVJmwGDNmjDIyMrR69WpPT+UHmTZtmgoKClyvEydOeHpKAAAAgMdU+HazlWns2LHauHGjdu7cqUaNGrmWh4aGqri4WPn5+W5nLXJychQaGuoa8/27N5XdNerSMd+/k1ROTo7sdrv8/f3l7e0tb2/vy44p28b3+fr6ytfXt2IHDAAAANxiPHrGwrIsjR07VuvXr9e2bdvUtGlTt/VRUVGqWbOmUlJSXMuOHj2q48ePKzo6WpIUHR2tQ4cOud296cMPP5TdbldkZKRrzKXbKBtTtg0fHx9FRUW5jSktLVVKSoprDAAAAIAr8+gZizFjxmjVqlV65513FBgY6LqeISgoSP7+/goKCtLIkSMVHx+v4OBg2e12jRs3TtHR0erWrZskqU+fPoqMjNTQoUM1b948ZWdna/r06RozZozrjMKoUaO0ePFiTZkyRSNGjNC2bdu0du1abdq0yTWX+Ph4xcXFqVOnTurSpYsWLlyowsJCDR8+/OZ/MQAAAEA149GweO211yRJPXv2dFu+fPlyDRs2TJK0YMECeXl5aeDAgSoqKpLD4dDSpUtdY729vbVx40aNHj1a0dHRql27tuLi4jRnzhzXmKZNm2rTpk2aOHGiFi1apEaNGun111+Xw+FwjRk8eLDy8vKUkJCg7OxsdejQQcnJyeUu6AYAAABQXpV6jkV1xnMsANzqeI7FlfEcCwA3Es+xAAAAAHDbICwAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMs
AAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGPNoWOzcuVMPP/ywwsLCZLPZtGHDBrf1lmUpISFBd9xxh/z9/RUTE6PPP//cbcypU6c0ZMgQ2e121alTRyNHjtTZs2fdxnz66ae655575Ofnp/DwcM2bN6/cXNatW6fWrVvLz89Pbdu21fvvv1/pxwsAAADcqjwaFoWFhWrfvr2WLFly2fXz5s3Tq6++qmXLlmnv3r2qXbu2HA6Hzp8/7xozZMgQHT58WB9++KE2btyonTt36le/+pVrvdPpVJ8+fdSkSROlpaXp5Zdf1qxZs/SnP/3JNWbPnj36xS9+oZEjR+of//iHBgwYoAEDBigjI+PGHTwAAABwC7FZlmV5ehKSZLPZtH79eg0YMEDSd2crwsLCNGnSJP3mN7+RJBUUFCgkJERJSUmKjY3VkSNHFBkZqU8++USdOnWSJCUnJ+uhhx7SV199pbCwML322mt67rnnlJ2dLR8fH0nSs88+qw0bNigzM1OSNHjwYBUWFmrjxo2u+XTr1k0dOnTQsmXLftD8nU6ngoKCVFBQILvdXllfy3VpssgjuwVwm/hyvKdnUHX9u8E9np4CgFtY87xdHtv39fwdt8peY3Hs2DFlZ2crJibGtSwoKEhdu3ZVamqqJCk1NVV16tRxRYUkxcTEyMvLS3v37nWNuffee11RIUkOh0NHjx7V6dOnXWMu3U/ZmLL9XE5RUZGcTqfbCwAAALhdVdmwyM7OliSFhIS4LQ8JCXGty87OVsOGDd3W16hRQ8HBwW5jLreNS/dxpTFl6y8nMTFRQUFBrld4ePj1HiIAAABwy6iyYVHVTZs2TQUFBa7XiRMnPD0lAAAAwGOqbFiEhoZKknJyctyW5+TkuNaFhoYqNzfXbf3Fixd16tQptzGX28al+7jSmLL1l+Pr6yu73e72AgAAAG5XVTYsmjZtqtDQUKWkpLiWOZ1O7d27V9HR0ZKk6Oho5efnKy0tzTVm27ZtKi0tVdeuXV1jdu7cqQsXLrjGfPjhh2rVqpXq1q3rGnPpfsrGlO0HAAAAwNV5NCzOnj2r9PR0paenS/rugu309HQdP35cNptNEyZM0AsvvKB3331Xhw4d0hNPPKGwsDDXnaPatGmjBx54QE899ZT27dun3bt3a+zYsYqNjVVYWJgk6fHHH5ePj49Gjhypw4cPa82aNVq0aJHi4+Nd8xg/frySk5P1yiuvKDMzU7NmzdL+/fs1duzYm/2VAAAAANVSDU/ufP/+/erVq5frfdlf9uPi4pSUlKQpU6aosLBQv/rVr5Sfn6+f/OQnSk5Olp+fn+szK1eu1NixY9W7d295eXlp4MCBevXVV13rg4KC9MEHH2jMmDGKiopS/fr1lZCQ4Pasi+7du2vVqlWaPn26fvvb36ply5basGGD7r777pvwLQAAAADVX5V5jkV1x3MsANzqeI7FlfEcCwA3Es+xAAAAAHDbICwAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAY
IywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIyy+Z8mSJYqIiJCfn5+6du2qffv2eXpKAAAAQJVHWFxizZo1io+P18yZM3XgwAG1b99eDodDubm5np4aAAAAUKURFpeYP3++nnrqKQ0fPlyRkZFatmyZatWqpTfeeMPTUwMAAACqNMLi/xQXFystLU0xMTGuZV5eXoqJiVFqaqoHZwYAAABUfTU8PYGq4ptvvlFJSYlCQkLcloeEhCgzM7Pc+KKiIhUVFbneFxQUSJKcTueNnehVlJ732K4B3AY8+OOtyjtTetHTUwBwC/Pk3y/L9m1Z1jXHEhYVlJiYqNmzZ5dbHh4e7oHZAMCNF/Ssp2cAALepoCBPz0BnzpxR0DXmQVj8n/r168vb21s5OTluy3NychQaGlpu/LRp0xQfH+96X1paqlOnTqlevXqy2Ww3fL6AKafTqfDwcJ04cUJ2u93T0wGA2wY/f1GdWJalM2fOKCws7JpjCYv/4+Pjo6ioKKWkpGjAgAGSvouFlJQUjR07ttx4X19f+fr6ui2rU6fOTZgpULnsdjv/YwMAD+DnL6qLa52pKENYXCI+Pl5xcXHq1KmTunTpooULF6qwsFDDhw/39NQAAACAKo2wuMTgwYOVl5enhIQEZWdnq0OHDkpOTi53QTcAAAAAd4TF94wdO/ayv/oE3Gp8fX01c+bMcr/SBwC4sfj5i1uVzfoh944CAAAAgKvgAXkAAAAAjBEWAAAAAIwRFgAAAACMERaABwwbNkw2m00vvfSS2/INGzZc9wMWIyIitHDhwmuOO3jwoPr376+GDRvKz89PERERGjx4sHJzc69rfwBwO8vLy9Po0aPVuHFj+fr6KjQ0VA6HQ7t37/b01ACPIywAD/Hz89PcuXN1+vTpG76vvLw89e7dW8HBwdqyZYuOHDmi5cuXKywsTIWFhTd8/wBwqxg4cKD+8Y9/aMWKFfrss8/07rvvqmfPnjp58qSnpwZ4HGEBeEhMTIxCQ0OVmJh41XH/8z//o7vuuku+vr6KiIjQK6+84lrXs2dPffnll5o4caJsNtsVz3bs3r1bBQUFev311/XjH/9YTZs2Va9evbRgwQI1bdpUkrR9+3bZbDZt2rRJ7dq1k5+fn7p166aMjAzXdk6ePKlf/OIX+tGPfqRatWqpbdu2euutt9z21bNnT40bN04TJkxQ3bp1FRISoj//+c+uh00GBgaqRYsW2rx5c0W/OgDwiPz8fO3atUtz585Vr1691KRJE3Xp0kXTpk1T//79JUk2m02vvfaaHnzwQfn7+6tZs2Z6++233bYzdepU3XnnnapVq5aaNWumGTNm6MKFC671s2bNUocOHfTGG2+ocePGCggI0DPPPKOSkhLNmzdPoaGhatiwoV588cWbevzAtRAWgId4e3vrd7/7nf7whz/oq6++uuyYtLQ0PfbYY4qNjdWhQ4c0a9YszZgxQ0lJSZKkv//972rUqJHmzJmjr7/+Wl9//fVltxMaGqqLFy9q/fr1utYdpidPnqxXXnlFn3zyiRo0aKCHH37Y9T+88+fPKyoqSps2bVJGRoZ+9atfaejQodq3b5/bNlasWKH69etr3759GjdunEaPHq2f//zn6t69uw4cOKA+ffpo6NChOnfu3HV+awDgOQEBAQoICNCGDRtU
VFR0xXEzZszQwIEDdfDgQQ0ZMkSxsbE6cuSIa31gYKCSkpL0z3/+U4sWLdKf//xnLViwwG0b//73v7V582YlJyfrrbfe0l/+8hf17dtXX331lXbs2KG5c+dq+vTp2rt37w07XuC6WQBuuri4OOuRRx6xLMuyunXrZo0YMcKyLMtav369del/lo8//rh1//33u3128uTJVmRkpOt9kyZNrAULFlxzn7/97W+tGjVqWMHBwdYDDzxgzZs3z8rOznat/+ijjyxJ1urVq13LTp48afn7+1tr1qy54nb79u1rTZo0yfX+vvvus37yk5+43l+8eNGqXbu2NXToUNeyr7/+2pJkpaamXnPeAFCVvP3221bdunUtPz8/q3v37ta0adOsgwcPutZLskaNGuX2ma5du1qjR4++4jZffvllKyoqyvV+5syZVq1atSyn0+la5nA4rIiICKukpMS1rFWrVlZiYmJlHBZQKThjAXjY3LlztWLFCrd/zSpz5MgR9ejRw21Zjx499Pnnn6ukpOS69vPiiy8qOztby5Yt01133aVly5apdevWOnTokNu46Oho15+Dg4PVqlUr19xKSkr0/PPPq23btgoODlZAQIC2bNmi48ePu22jXbt2rj97e3urXr16atu2rWtZSEiIJHHhOIBqZ+DAgcrKytK7776rBx54QNu3b1fHjh1dZ5Il95+jZe8v/Rm/Zs0a9ejRQ6GhoQoICND06dPL/RyNiIhQYGCg631ISIgiIyPl5eXltoyfo6hKCAvAw+699145HA5Nmzbthu+rXr16+vnPf67f//73OnLkiMLCwvT73//+B3/+5Zdf1qJFizR16lR99NFHSk9Pl8PhUHFxsdu4mjVrur232Wxuy8quBSktLTU4GgDwDD8/P91///2aMWOG9uzZo2HDhmnmzJk/6LOpqakaMmSIHnroIW3cuFH/+Mc/9Nxzz133z9GyZfwcRVVCWABVwEsvvaT33ntPqampbsvbtGlT7haGu3fv1p133ilvb29Jko+Pz3WfvSj7XPPmzcvdFerjjz92/fn06dP67LPP1KZNG9e+H3nkEf3yl79U+/bt1axZM3322WfXvW8AuJVERka6/Sy99Odo2fuyn6N79uxRkyZN9Nxzz6lTp05q2bKlvvzyy5s6X+BGqeHpCQCQ2rZtqyFDhujVV191Wz5p0iR17txZzz//vAYPHqzU1FQtXrxYS5cudY2JiIjQzp07FRsbK19fX9WvX7/c9jdu3KjVq1crNjZWd955pyzL0nvvvaf3339fy5cvdxs7Z84c1atXTyEhIXruuedUv359DRgwQJLUsmVLvf3229qzZ4/q1q2r+fPnKycnR5GRkZX/pQBAFXPy5En9/Oc/14gRI9SuXTsFBgZq//79mjdvnh555BHXuHXr1qlTp076yU9+opUrV2rfvn36y1/+Ium7n6PHjx/X6tWr1blzZ23atEnr16/31CEBlYozFkAVMWfOnHKntDt27Ki1a9dq9erVuvvuu5WQkKA5c+Zo2LBhbp/74osv1Lx5czVo0OCy246MjFStWrU0adIkdejQQd26ddPatWv1+uuva+jQoW5jX3rpJY0fP15RUVHKzs7We++9Jx8fH0nS9OnT1bFjRzkcDvXs2VOhoaGu6ACAW11AQIC6du2qBQsW6N5779Xdd9+tGTNm6KmnntLixYtd42bPnq3Vq1erXbt2+utf/6q33nrL9Q8w/fv318SJEzV27Fh16NBBe/bs0YwZMzx1SEClslnWNe49CeC2sH37dvXq1UunT59WnTp1PD0dAKiWbDab1q9fzz+64LbEGQsAAAAAxggLAAAAAMb4VSgAAAAAxjhjAQAAAMAYYQEAAADAGGEBAAAAwBhhAQAAAMAYYQEAAADAGGEBAKg2kpKSKuUBjjabTRs2bDDeDgDg/yMsAAA31bBhw3gqMQDcgggLAAAAAMYICwBAlTF//ny1bdtWtWvXVnh4uJ555hmdPXu23LgN
GzaoZcuW8vPzk8Ph0IkTJ9zWv/POO+rYsaP8/PzUrFkzzZ49WxcvXrxZhwEAtyXCAgBQZXh5eenVV1/V4cOHtWLFCm3btk1TpkxxG3Pu3Dm9+OKL+utf/6rdu3crPz9fsbGxrvW7du3SE088ofHjx+uf//yn/vjHPyopKUkvvvjizT4cALit2CzLsjw9CQDA7WPYsGHKz8//QRdPv/322xo1apS++eYbSd9dvD18+HB9/PHH6tq1qyQpMzNTbdq00d69e9WlSxfFxMSod+/emjZtmms7f/vb3zRlyhRlZWVJ+u7i7fXr13OtBwBUohqengAAAGW2bt2qxMREZWZmyul06uLFizp//rzOnTunWrVqSZJq1Kihzp07uz7TunVr1alTR0eOHFGXLl108OBB7d692+0MRUlJSbntAAAqF2EBAKgSvvjiC/Xr10+jR4/Wiy++qODgYP3v//6vRo4cqeLi4h8cBGfPntXs2bP16KOPllvn5+dX2dMGAPwfwgIAUCWkpaWptLRUr7zyiry8vrsEcO3ateXGXbx4Ufv371eXLl0kSUePHlV+fr7atGkjSerYsaOOHj2qFi1a3LzJAwAICwDAzVdQUKD09HS3ZfXr19eFCxf0hz/8QQ8//LB2796tZcuWlftszZo1NW7cOL366quqUaOGxo4dq27durlCIyEhQf369VPjxo01aNAgeXl56eDBg8rIyNALL7xwMw4PAG5L3BUKAHDTbd++XT/+8Y/dXm+++abmz5+vuXPn6u6779bKlSuVmJhY7rO1atXS1KlT9fjjj6tHjx4KCAjQmjVrXOsdDoc2btyoDz74QJ07d1a3bt20YMECNWnS5GYeIgDcdrgrFAAAAABjnLEAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABgjLAAAAAAYIywAAAAAGCMsAAAAABg7P8BsKsiB2UlMZ4AAAAASUVORK5CYII=",
187
+ "text/plain": [
188
+ "<Figure size 800x500 with 1 Axes>"
189
+ ]
190
+ },
191
+ "metadata": {},
192
+ "output_type": "display_data"
193
+ }
194
+ ],
195
+ "source": [
196
+ "plt.figure(figsize = (8, 5), tight_layout = True)\n",
197
+ "plt.bar(value_counts.index, value_counts.values, color=['DodgerBlue', 'Crimson'])\n",
198
+ "\n",
199
+ "plt.xlabel(\"Label\")\n",
200
+ "plt.ylabel(\"Count\")\n",
201
+ "plt.title(\"Value Counts\")\n",
202
+ "\n",
203
+ "plt.tight_layout()\n",
204
+ "plt.show()"
205
+ ]
206
+ },
207
+ {
208
+ "cell_type": "markdown",
209
+ "id": "8d41f8c8",
210
+ "metadata": {},
211
+ "source": [
212
+ "## 2. Preprocessing"
213
+ ]
214
+ },
215
+ {
216
+ "cell_type": "markdown",
217
+ "id": "37ea4009",
218
+ "metadata": {},
219
+ "source": [
220
+ "## 2.1. Dropping Non-English Rows"
221
+ ]
222
+ },
223
+ {
224
+ "cell_type": "code",
225
+ "execution_count": 7,
226
+ "id": "f8ef682a",
227
+ "metadata": {},
228
+ "outputs": [],
229
+ "source": [
230
+ "from langdetect import detect\n",
231
+ "\n",
232
+ "def is_english(tweet):\n",
233
+ "\n",
234
+ " try:\n",
235
+ " return detect(tweet) == 'en'\n",
236
+ "\n",
237
+ " except:\n",
238
+ " return False"
239
+ ]
240
+ },
241
+ {
242
+ "cell_type": "code",
243
+ "execution_count": 8,
244
+ "id": "cc62ea88",
245
+ "metadata": {},
246
+ "outputs": [
247
+ {
248
+ "name": "stdout",
249
+ "output_type": "stream",
250
+ "text": [
251
+ "Shape of DataFrame after dropping non-English Emails: (78940, 2)\n"
252
+ ]
253
+ },
254
+ {
255
+ "data": {
256
+ "text/html": [
257
+ "<div>\n",
258
+ "<style scoped>\n",
259
+ " .dataframe tbody tr th:only-of-type {\n",
260
+ " vertical-align: middle;\n",
261
+ " }\n",
262
+ "\n",
263
+ " .dataframe tbody tr th {\n",
264
+ " vertical-align: top;\n",
265
+ " }\n",
266
+ "\n",
267
+ " .dataframe thead th {\n",
268
+ " text-align: right;\n",
269
+ " }\n",
270
+ "</style>\n",
271
+ "<table border=\"1\" class=\"dataframe\">\n",
272
+ " <thead>\n",
273
+ " <tr style=\"text-align: right;\">\n",
274
+ " <th></th>\n",
275
+ " <th>label</th>\n",
276
+ " <th>text</th>\n",
277
+ " </tr>\n",
278
+ " </thead>\n",
279
+ " <tbody>\n",
280
+ " <tr>\n",
281
+ " <th>0</th>\n",
282
+ " <td>1</td>\n",
283
+ " <td>ounce feather bowl hummingbird opec moment ala...</td>\n",
284
+ " </tr>\n",
285
+ " <tr>\n",
286
+ " <th>1</th>\n",
287
+ " <td>1</td>\n",
288
+ " <td>wulvob get your medircations online qnb ikud v...</td>\n",
289
+ " </tr>\n",
290
+ " <tr>\n",
291
+ " <th>2</th>\n",
292
+ " <td>0</td>\n",
293
+ " <td>computer connection from cnn com wednesday es...</td>\n",
294
+ " </tr>\n",
295
+ " <tr>\n",
296
+ " <th>3</th>\n",
297
+ " <td>1</td>\n",
298
+ " <td>university degree obtain a prosperous future m...</td>\n",
299
+ " </tr>\n",
300
+ " <tr>\n",
301
+ " <th>4</th>\n",
302
+ " <td>0</td>\n",
303
+ " <td>thanks for all your answers guys i know i shou...</td>\n",
304
+ " </tr>\n",
305
+ " </tbody>\n",
306
+ "</table>\n",
307
+ "</div>"
308
+ ],
309
+ "text/plain": [
310
+ " label text\n",
311
+ "0 1 ounce feather bowl hummingbird opec moment ala...\n",
312
+ "1 1 wulvob get your medircations online qnb ikud v...\n",
313
+ "2 0 computer connection from cnn com wednesday es...\n",
314
+ "3 1 university degree obtain a prosperous future m...\n",
315
+ "4 0 thanks for all your answers guys i know i shou..."
316
+ ]
317
+ },
318
+ "execution_count": 8,
319
+ "metadata": {},
320
+ "output_type": "execute_result"
321
+ }
322
+ ],
323
+ "source": [
324
+ "df = df[df['text'].apply(is_english)]\n",
325
+ "\n",
326
+ "print(\"Shape of DataFrame after dropping non-English Emails:\", df.shape)\n",
327
+ "df.head()"
328
+ ]
329
+ },
330
+ {
331
+ "cell_type": "code",
332
+ "execution_count": 9,
333
+ "id": "486a363d",
334
+ "metadata": {},
335
+ "outputs": [
336
+ {
337
+ "name": "stdout",
338
+ "output_type": "stream",
339
+ "text": [
340
+ "['\\x01', '\\x02', '\\x03', '\\x05', '\\x07', '\\x08', '\\n', '\\x0e', '\\x0f', '\\x10', '\\x11', '\\x12', '\\x13', '\\x14', '\\x15', '\\x16', '\\x17', '\\x19', '\\x1b', ' ', '!', '\"', '#', '$', '%', '&', \"'\", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', '[', '\\\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '\\x7f', '\\x80', '\\x81', '\\x82', '\\x83', '\\x84', '\\x85', '\\x86', '\\x87', '\\x88', '\\x89', '\\x8a', '\\x8b', '\\x8c', '\\x8d', '\\x8e', '\\x8f', '\\x90', '\\x91', '\\x92', '\\x93', '\\x94', '\\x95', '\\x96', '\\x97', '\\x98', '\\x99', '\\x9a', '\\x9b', '\\x9c', '\\x9d', '\\x9e', '\\x9f', '¡', '¢', '£', '¤', '¥', '¦', '§', '¨', '©', 'ª', '«', '¬', '\\xad', '®', '¯', '°', '±', '²', '³', '´', 'µ', '¶', '·', '¸', '¹', 'º', '»', '¼', '½', '¾', '¿', 'À', '×', 'Ü', 'Ý', 'Þ', 'ß', 'à', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï', 'ð', 'ñ', 'ò', 'ó', 'ô', 'õ', 'ö', '÷', 'ø', 'ù', 'ú', 'û', 'ü', 'ý', 'þ', 'ÿ', 'ć', 'č', 'ę', 'ğ', 'ī', 'ł', 'ń', 'ś', 'ż', 'ș', 'ɤ', '̇', 'ͨ', 'β', 'θ', 'π', 'а', 'б', 'в', 'г', 'д', 'е', 'ж', 'з', 'и', 'к', 'л', 'м', 'н', 'о', 'п', 'р', 'с', 'т', 'у', 'х', 'ц', 'ч', 'ш', 'щ', 'ъ', 'ь', 'э', 'ю', 'я', 'ё', 'ђ', 'ѕ', 'њ', 'ќ', 'ӿ', 'א', 'ב', 'ו', 'ח', 'י', 'ל', 'ם', 'מ', 'נ', 'פ', 'ר', 'ש', 'ת', 'ث', 'ڶ', 'ṁ', '–', '—', '‘', '’', '“', '”', '†', '‡', '•', '…', '⁄', '€', '№', '™', '−', '∷', '⌂', '☺', '♣', '✗', '。', 'み', 'む', 'も', 'シ', 'デ', 'モ', 'ュ', 'リ', 'ル', 'ン', '・', 'ㄞ', 'ㄢ', '上', '世', '中', '享', '件', '俊', '信', '写', '劎', '大', '子', '小', '我', '拒', '文', '最', '李', '杰', '楊', '用', '电', '界', '的', '系', '统', '膄', '膅', '膆', '道', '邮', 'fi', '(', ')', '', '�']\n"
341
+ ]
342
+ }
343
+ ],
344
+ "source": [
345
+ "all_text = ''.join(df['text'].tolist())\n",
346
+ "unique_letters = sorted(list(set(all_text)))\n",
347
+ "\n",
348
+ "print(unique_letters)"
349
+ ]
350
+ },
351
+ {
352
+ "cell_type": "markdown",
353
+ "id": "1069a4c0",
354
+ "metadata": {},
355
+ "source": [
356
+ "## 2.2. Text Cleaning"
357
+ ]
358
+ },
359
+ {
360
+ "cell_type": "code",
361
+ "execution_count": 10,
362
+ "id": "0d57fec8",
363
+ "metadata": {},
364
+ "outputs": [
365
+ {
366
+ "name": "stderr",
367
+ "output_type": "stream",
368
+ "text": [
369
+ "[nltk_data] Downloading package wordnet to /home/mohamed-\n",
370
+ "[nltk_data] hamdy/nltk_data...\n",
371
+ "[nltk_data] Package wordnet is already up-to-date!\n",
372
+ "[nltk_data] Downloading package stopwords to /home/mohamed-\n",
373
+ "[nltk_data] hamdy/nltk_data...\n",
374
+ "[nltk_data] Package stopwords is already up-to-date!\n"
375
+ ]
376
+ },
377
+ {
378
+ "data": {
379
+ "text/plain": [
380
+ "True"
381
+ ]
382
+ },
383
+ "execution_count": 10,
384
+ "metadata": {},
385
+ "output_type": "execute_result"
386
+ }
387
+ ],
388
+ "source": [
389
+ "import re\n",
390
+ "import nltk\n",
391
+ "import contractions\n",
392
+ "from nltk.corpus import stopwords\n",
393
+ "from nltk.stem import WordNetLemmatizer\n",
394
+ "\n",
395
+ "nltk.download('wordnet')\n",
396
+ "nltk.download('stopwords')"
397
+ ]
398
+ },
399
+ {
400
+ "cell_type": "code",
401
+ "execution_count": null,
402
+ "id": "9fd59c29",
403
+ "metadata": {},
404
+ "outputs": [],
405
+ "source": [
406
+ "def text_cleaning(text):\n",
407
+ "\n",
408
+ " # Converting text to lowercase\n",
409
+ " text = text.lower()\n",
410
+ "\n",
411
+ " # Removing Extra Spaces\n",
412
+ " text = re.sub(r'\\s+', ' ', text).strip()\n",
413
+ "\n",
414
+ " # Replacing Numbers with a Token\n",
415
+ " text = re.sub(r'\\d+', '<NUM>', text)\n",
416
+ "\n",
417
+ " # Normalize Elongated Words\n",
418
+ " text = re.sub(r'(.)\\1+', r'\\1\\1', text) \n",
419
+ "\n",
420
+ " # Expand Contractions (e.g.: weren't => were not)\n",
421
+ " text = contractions.fix(text)\n",
422
+ " \n",
423
+ " # Removing Punctuation and Non-English Characters\n",
424
+ " text = re.sub(r'[^a-z0-9\\s]', '', text) \n",
425
+ "\n",
426
+ " # Lemmatization \n",
427
+ " words = text.split()\n",
428
+ " lemmatizer = WordNetLemmatizer()\n",
429
+ " words = [lemmatizer.lemmatize(word) for word in words]\n",
430
+ "\n",
431
+ " # Returning the Cleaned Text \n",
432
+ " cleaned_text = ' '.join(words)\n",
433
+ " return cleaned_text"
434
+ ]
435
+ },
436
+ {
437
+ "cell_type": "code",
438
+ "execution_count": 12,
439
+ "id": "4b0d2f0c",
440
+ "metadata": {},
441
+ "outputs": [
442
+ {
443
+ "data": {
444
+ "text/plain": [
445
+ "0 ounce feather bowl hummingbird opec moment ala...\n",
446
+ "1 wulvob get your medircations online qnb ikud v...\n",
447
+ "2 computer connection from cnn com wednesday esc...\n",
448
+ "3 university degree obtain a prosperous future m...\n",
449
+ "4 thanks for all your answer guy i know i should...\n",
450
+ "Name: text, dtype: object"
451
+ ]
452
+ },
453
+ "execution_count": 12,
454
+ "metadata": {},
455
+ "output_type": "execute_result"
456
+ }
457
+ ],
458
+ "source": [
459
+ "df['text'] = df['text'].apply(text_cleaning)\n",
460
+ "df['text'].head()"
461
+ ]
462
+ },
463
+ {
464
+ "cell_type": "code",
465
+ "execution_count": 13,
466
+ "id": "786e21f6",
467
+ "metadata": {},
468
+ "outputs": [
469
+ {
470
+ "name": "stdout",
471
+ "output_type": "stream",
472
+ "text": [
473
+ "[' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']\n"
474
+ ]
475
+ }
476
+ ],
477
+ "source": [
478
+ "all_text = ''.join(df['text'].tolist())\n",
479
+ "unique_letters = sorted(list(set(all_text)))\n",
480
+ "\n",
481
+ "print(unique_letters)"
482
+ ]
483
+ },
484
+ {
485
+ "cell_type": "markdown",
486
+ "id": "e9d4da45",
487
+ "metadata": {},
488
+ "source": [
489
+ "## 2.3. Train Test Split"
490
+ ]
491
+ },
492
+ {
493
+ "cell_type": "code",
494
+ "execution_count": 14,
495
+ "id": "15306c73",
496
+ "metadata": {},
497
+ "outputs": [
498
+ {
499
+ "data": {
500
+ "text/html": [
501
+ "<div>\n",
502
+ "<style scoped>\n",
503
+ " .dataframe tbody tr th:only-of-type {\n",
504
+ " vertical-align: middle;\n",
505
+ " }\n",
506
+ "\n",
507
+ " .dataframe tbody tr th {\n",
508
+ " vertical-align: top;\n",
509
+ " }\n",
510
+ "\n",
511
+ " .dataframe thead th {\n",
512
+ " text-align: right;\n",
513
+ " }\n",
514
+ "</style>\n",
515
+ "<table border=\"1\" class=\"dataframe\">\n",
516
+ " <thead>\n",
517
+ " <tr style=\"text-align: right;\">\n",
518
+ " <th></th>\n",
519
+ " <th>label</th>\n",
520
+ " <th>text</th>\n",
521
+ " </tr>\n",
522
+ " </thead>\n",
523
+ " <tbody>\n",
524
+ " <tr>\n",
525
+ " <th>51626</th>\n",
526
+ " <td>0</td>\n",
527
+ " <td>below is a list of the major item that are sti...</td>\n",
528
+ " </tr>\n",
529
+ " <tr>\n",
530
+ " <th>76658</th>\n",
531
+ " <td>0</td>\n",
532
+ " <td>escapenumberfxml version escapenumberd escapen...</td>\n",
533
+ " </tr>\n",
534
+ " <tr>\n",
535
+ " <th>5491</th>\n",
536
+ " <td>1</td>\n",
537
+ " <td>our offer are unbeatable and we always update ...</td>\n",
538
+ " </tr>\n",
539
+ " <tr>\n",
540
+ " <th>6697</th>\n",
541
+ " <td>1</td>\n",
542
+ " <td>chms pioneering explosive wireless niche in ch...</td>\n",
543
+ " </tr>\n",
544
+ " <tr>\n",
545
+ " <th>66300</th>\n",
546
+ " <td>1</td>\n",
547
+ " <td>hi there try our market leading product c i a ...</td>\n",
548
+ " </tr>\n",
549
+ " </tbody>\n",
550
+ "</table>\n",
551
+ "</div>"
552
+ ],
553
+ "text/plain": [
554
+ " label text\n",
555
+ "51626 0 below is a list of the major item that are sti...\n",
556
+ "76658 0 escapenumberfxml version escapenumberd escapen...\n",
557
+ "5491 1 our offer are unbeatable and we always update ...\n",
558
+ "6697 1 chms pioneering explosive wireless niche in ch...\n",
559
+ "66300 1 hi there try our market leading product c i a ..."
560
+ ]
561
+ },
562
+ "execution_count": 14,
563
+ "metadata": {},
564
+ "output_type": "execute_result"
565
+ }
566
+ ],
567
+ "source": [
568
+ "from sklearn.model_selection import train_test_split\n",
569
+ "\n",
570
+ "train_df, test_df = train_test_split(\n",
571
+ " df, \n",
572
+ " test_size = 0.2, \n",
573
+ " random_state = 42, \n",
574
+ " stratify = df['label'],\n",
575
+ " shuffle = True\n",
576
+ ")\n",
577
+ "\n",
578
+ "pd.DataFrame(train_df).to_csv('../data/processed/train.csv', index = False)\n",
579
+ "pd.DataFrame(test_df).to_csv('../data/processed/test.csv', index = False)\n",
580
+ "test_df.head()"
581
+ ]
582
+ },
583
+ {
584
+ "cell_type": "code",
585
+ "execution_count": 15,
586
+ "id": "187269f3",
587
+ "metadata": {},
588
+ "outputs": [],
589
+ "source": [
590
+ "X_train = train_df['text']\n",
591
+ "X_test = test_df['text']\n",
592
+ "y_train = train_df['label']\n",
593
+ "y_test = test_df['label']"
594
+ ]
595
+ },
596
+ {
597
+ "cell_type": "code",
598
+ "execution_count": 16,
599
+ "id": "1ed04b87",
600
+ "metadata": {},
601
+ "outputs": [
602
+ {
603
+ "name": "stdout",
604
+ "output_type": "stream",
605
+ "text": [
606
+ "Shape of X_train: (63152,)\n",
607
+ "Shape of X_test: (15788,)\n",
608
+ "Shape of y_train: (63152,)\n",
609
+ "Shape of y_test: (15788,)\n"
610
+ ]
611
+ }
612
+ ],
613
+ "source": [
614
+ "print(f\"Shape of X_train: {X_train.shape}\")\n",
615
+ "print(f\"Shape of X_test: {X_test.shape}\")\n",
616
+ "print(f\"Shape of y_train: {y_train.shape}\")\n",
617
+ "print(f\"Shape of y_test: {y_test.shape}\")"
618
+ ]
619
+ },
620
+ {
621
+ "cell_type": "markdown",
622
+ "id": "cc846c18",
623
+ "metadata": {},
624
+ "source": [
625
+ "## 2.4. Vectorization"
626
+ ]
627
+ },
628
+ {
629
+ "cell_type": "code",
630
+ "execution_count": 17,
631
+ "id": "a2587078",
632
+ "metadata": {},
633
+ "outputs": [
634
+ {
635
+ "data": {
636
+ "text/plain": [
637
+ "<Compressed Sparse Row sparse matrix of dtype 'float64'\n",
638
+ "\twith 5462956 stored elements and shape (63152, 212929)>"
639
+ ]
640
+ },
641
+ "execution_count": 17,
642
+ "metadata": {},
643
+ "output_type": "execute_result"
644
+ }
645
+ ],
646
+ "source": [
647
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
648
+ "\n",
649
+ "vectorizer = TfidfVectorizer(stop_words = 'english')\n",
650
+ "X_train = vectorizer.fit_transform(X_train) \n",
651
+ "X_test = vectorizer.transform(X_test)\n",
652
+ "\n",
653
+ "X_train "
654
+ ]
655
+ },
656
+ {
657
+ "cell_type": "markdown",
658
+ "id": "4b3b4cba",
659
+ "metadata": {},
660
+ "source": [
661
+ "## 3. Model"
662
+ ]
663
+ },
664
+ {
665
+ "cell_type": "markdown",
666
+ "id": "a81b3b8c",
667
+ "metadata": {},
668
+ "source": [
669
+ "## 3.1. Training"
670
+ ]
671
+ },
672
+ {
673
+ "cell_type": "code",
674
+ "execution_count": 47,
675
+ "id": "d234d65f",
676
+ "metadata": {},
677
+ "outputs": [
678
+ {
679
+ "data": {
680
+ "text/html": [
681
+ "<style>#sk-container-id-5 {\n",
682
+ " /* Definition of color scheme common for light and dark mode */\n",
683
+ " --sklearn-color-text: #000;\n",
684
+ " --sklearn-color-text-muted: #666;\n",
685
+ " --sklearn-color-line: gray;\n",
686
+ " /* Definition of color scheme for unfitted estimators */\n",
687
+ " --sklearn-color-unfitted-level-0: #fff5e6;\n",
688
+ " --sklearn-color-unfitted-level-1: #f6e4d2;\n",
689
+ " --sklearn-color-unfitted-level-2: #ffe0b3;\n",
690
+ " --sklearn-color-unfitted-level-3: chocolate;\n",
691
+ " /* Definition of color scheme for fitted estimators */\n",
692
+ " --sklearn-color-fitted-level-0: #f0f8ff;\n",
693
+ " --sklearn-color-fitted-level-1: #d4ebff;\n",
694
+ " --sklearn-color-fitted-level-2: #b3dbfd;\n",
695
+ " --sklearn-color-fitted-level-3: cornflowerblue;\n",
696
+ "\n",
697
+ " /* Specific color for light theme */\n",
698
+ " --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
699
+ " --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n",
700
+ " --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
701
+ " --sklearn-color-icon: #696969;\n",
702
+ "\n",
703
+ " @media (prefers-color-scheme: dark) {\n",
704
+ " /* Redefinition of color scheme for dark theme */\n",
705
+ " --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
706
+ " --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n",
707
+ " --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
708
+ " --sklearn-color-icon: #878787;\n",
709
+ " }\n",
710
+ "}\n",
711
+ "\n",
712
+ "#sk-container-id-5 {\n",
713
+ " color: var(--sklearn-color-text);\n",
714
+ "}\n",
715
+ "\n",
716
+ "#sk-container-id-5 pre {\n",
717
+ " padding: 0;\n",
718
+ "}\n",
719
+ "\n",
720
+ "#sk-container-id-5 input.sk-hidden--visually {\n",
721
+ " border: 0;\n",
722
+ " clip: rect(1px 1px 1px 1px);\n",
723
+ " clip: rect(1px, 1px, 1px, 1px);\n",
724
+ " height: 1px;\n",
725
+ " margin: -1px;\n",
726
+ " overflow: hidden;\n",
727
+ " padding: 0;\n",
728
+ " position: absolute;\n",
729
+ " width: 1px;\n",
730
+ "}\n",
731
+ "\n",
732
+ "#sk-container-id-5 div.sk-dashed-wrapped {\n",
733
+ " border: 1px dashed var(--sklearn-color-line);\n",
734
+ " margin: 0 0.4em 0.5em 0.4em;\n",
735
+ " box-sizing: border-box;\n",
736
+ " padding-bottom: 0.4em;\n",
737
+ " background-color: var(--sklearn-color-background);\n",
738
+ "}\n",
739
+ "\n",
740
+ "#sk-container-id-5 div.sk-container {\n",
741
+ " /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n",
742
+ " but bootstrap.min.css set `[hidden] { display: none !important; }`\n",
743
+ " so we also need the `!important` here to be able to override the\n",
744
+ " default hidden behavior on the sphinx rendered scikit-learn.org.\n",
745
+ " See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n",
746
+ " display: inline-block !important;\n",
747
+ " position: relative;\n",
748
+ "}\n",
749
+ "\n",
750
+ "#sk-container-id-5 div.sk-text-repr-fallback {\n",
751
+ " display: none;\n",
752
+ "}\n",
753
+ "\n",
754
+ "div.sk-parallel-item,\n",
755
+ "div.sk-serial,\n",
756
+ "div.sk-item {\n",
757
+ " /* draw centered vertical line to link estimators */\n",
758
+ " background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n",
759
+ " background-size: 2px 100%;\n",
760
+ " background-repeat: no-repeat;\n",
761
+ " background-position: center center;\n",
762
+ "}\n",
763
+ "\n",
764
+ "/* Parallel-specific style estimator block */\n",
765
+ "\n",
766
+ "#sk-container-id-5 div.sk-parallel-item::after {\n",
767
+ " content: \"\";\n",
768
+ " width: 100%;\n",
769
+ " border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n",
770
+ " flex-grow: 1;\n",
771
+ "}\n",
772
+ "\n",
773
+ "#sk-container-id-5 div.sk-parallel {\n",
774
+ " display: flex;\n",
775
+ " align-items: stretch;\n",
776
+ " justify-content: center;\n",
777
+ " background-color: var(--sklearn-color-background);\n",
778
+ " position: relative;\n",
779
+ "}\n",
780
+ "\n",
781
+ "#sk-container-id-5 div.sk-parallel-item {\n",
782
+ " display: flex;\n",
783
+ " flex-direction: column;\n",
784
+ "}\n",
785
+ "\n",
786
+ "#sk-container-id-5 div.sk-parallel-item:first-child::after {\n",
787
+ " align-self: flex-end;\n",
788
+ " width: 50%;\n",
789
+ "}\n",
790
+ "\n",
791
+ "#sk-container-id-5 div.sk-parallel-item:last-child::after {\n",
792
+ " align-self: flex-start;\n",
793
+ " width: 50%;\n",
794
+ "}\n",
795
+ "\n",
796
+ "#sk-container-id-5 div.sk-parallel-item:only-child::after {\n",
797
+ " width: 0;\n",
798
+ "}\n",
799
+ "\n",
800
+ "/* Serial-specific style estimator block */\n",
801
+ "\n",
802
+ "#sk-container-id-5 div.sk-serial {\n",
803
+ " display: flex;\n",
804
+ " flex-direction: column;\n",
805
+ " align-items: center;\n",
806
+ " background-color: var(--sklearn-color-background);\n",
807
+ " padding-right: 1em;\n",
808
+ " padding-left: 1em;\n",
809
+ "}\n",
810
+ "\n",
811
+ "\n",
812
+ "/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n",
813
+ "clickable and can be expanded/collapsed.\n",
814
+ "- Pipeline and ColumnTransformer use this feature and define the default style\n",
815
+ "- Estimators will overwrite some part of the style using the `sk-estimator` class\n",
816
+ "*/\n",
817
+ "\n",
818
+ "/* Pipeline and ColumnTransformer style (default) */\n",
819
+ "\n",
820
+ "#sk-container-id-5 div.sk-toggleable {\n",
821
+ " /* Default theme specific background. It is overwritten whether we have a\n",
822
+ " specific estimator or a Pipeline/ColumnTransformer */\n",
823
+ " background-color: var(--sklearn-color-background);\n",
824
+ "}\n",
825
+ "\n",
826
+ "/* Toggleable label */\n",
827
+ "#sk-container-id-5 label.sk-toggleable__label {\n",
828
+ " cursor: pointer;\n",
829
+ " display: flex;\n",
830
+ " width: 100%;\n",
831
+ " margin-bottom: 0;\n",
832
+ " padding: 0.5em;\n",
833
+ " box-sizing: border-box;\n",
834
+ " text-align: center;\n",
835
+ " align-items: start;\n",
836
+ " justify-content: space-between;\n",
837
+ " gap: 0.5em;\n",
838
+ "}\n",
839
+ "\n",
840
+ "#sk-container-id-5 label.sk-toggleable__label .caption {\n",
841
+ " font-size: 0.6rem;\n",
842
+ " font-weight: lighter;\n",
843
+ " color: var(--sklearn-color-text-muted);\n",
844
+ "}\n",
845
+ "\n",
846
+ "#sk-container-id-5 label.sk-toggleable__label-arrow:before {\n",
847
+ " /* Arrow on the left of the label */\n",
848
+ " content: \"▸\";\n",
849
+ " float: left;\n",
850
+ " margin-right: 0.25em;\n",
851
+ " color: var(--sklearn-color-icon);\n",
852
+ "}\n",
853
+ "\n",
854
+ "#sk-container-id-5 label.sk-toggleable__label-arrow:hover:before {\n",
855
+ " color: var(--sklearn-color-text);\n",
856
+ "}\n",
857
+ "\n",
858
+ "/* Toggleable content - dropdown */\n",
859
+ "\n",
860
+ "#sk-container-id-5 div.sk-toggleable__content {\n",
861
+ " display: none;\n",
862
+ " text-align: left;\n",
863
+ " /* unfitted */\n",
864
+ " background-color: var(--sklearn-color-unfitted-level-0);\n",
865
+ "}\n",
866
+ "\n",
867
+ "#sk-container-id-5 div.sk-toggleable__content.fitted {\n",
868
+ " /* fitted */\n",
869
+ " background-color: var(--sklearn-color-fitted-level-0);\n",
870
+ "}\n",
871
+ "\n",
872
+ "#sk-container-id-5 div.sk-toggleable__content pre {\n",
873
+ " margin: 0.2em;\n",
874
+ " border-radius: 0.25em;\n",
875
+ " color: var(--sklearn-color-text);\n",
876
+ " /* unfitted */\n",
877
+ " background-color: var(--sklearn-color-unfitted-level-0);\n",
878
+ "}\n",
879
+ "\n",
880
+ "#sk-container-id-5 div.sk-toggleable__content.fitted pre {\n",
881
+ " /* unfitted */\n",
882
+ " background-color: var(--sklearn-color-fitted-level-0);\n",
883
+ "}\n",
884
+ "\n",
885
+ "#sk-container-id-5 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
886
+ " /* Expand drop-down */\n",
887
+ " display: block;\n",
888
+ " width: 100%;\n",
889
+ " overflow: visible;\n",
890
+ "}\n",
891
+ "\n",
892
+ "#sk-container-id-5 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
893
+ " content: \"▾\";\n",
894
+ "}\n",
895
+ "\n",
896
+ "/* Pipeline/ColumnTransformer-specific style */\n",
897
+ "\n",
898
+ "#sk-container-id-5 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
899
+ " color: var(--sklearn-color-text);\n",
900
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
901
+ "}\n",
902
+ "\n",
903
+ "#sk-container-id-5 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
904
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
905
+ "}\n",
906
+ "\n",
907
+ "/* Estimator-specific style */\n",
908
+ "\n",
909
+ "/* Colorize estimator box */\n",
910
+ "#sk-container-id-5 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
911
+ " /* unfitted */\n",
912
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
913
+ "}\n",
914
+ "\n",
915
+ "#sk-container-id-5 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
916
+ " /* fitted */\n",
917
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
918
+ "}\n",
919
+ "\n",
920
+ "#sk-container-id-5 div.sk-label label.sk-toggleable__label,\n",
921
+ "#sk-container-id-5 div.sk-label label {\n",
922
+ " /* The background is the default theme color */\n",
923
+ " color: var(--sklearn-color-text-on-default-background);\n",
924
+ "}\n",
925
+ "\n",
926
+ "/* On hover, darken the color of the background */\n",
927
+ "#sk-container-id-5 div.sk-label:hover label.sk-toggleable__label {\n",
928
+ " color: var(--sklearn-color-text);\n",
929
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
930
+ "}\n",
931
+ "\n",
932
+ "/* Label box, darken color on hover, fitted */\n",
933
+ "#sk-container-id-5 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
934
+ " color: var(--sklearn-color-text);\n",
935
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
936
+ "}\n",
937
+ "\n",
938
+ "/* Estimator label */\n",
939
+ "\n",
940
+ "#sk-container-id-5 div.sk-label label {\n",
941
+ " font-family: monospace;\n",
942
+ " font-weight: bold;\n",
943
+ " display: inline-block;\n",
944
+ " line-height: 1.2em;\n",
945
+ "}\n",
946
+ "\n",
947
+ "#sk-container-id-5 div.sk-label-container {\n",
948
+ " text-align: center;\n",
949
+ "}\n",
950
+ "\n",
951
+ "/* Estimator-specific */\n",
952
+ "#sk-container-id-5 div.sk-estimator {\n",
953
+ " font-family: monospace;\n",
954
+ " border: 1px dotted var(--sklearn-color-border-box);\n",
955
+ " border-radius: 0.25em;\n",
956
+ " box-sizing: border-box;\n",
957
+ " margin-bottom: 0.5em;\n",
958
+ " /* unfitted */\n",
959
+ " background-color: var(--sklearn-color-unfitted-level-0);\n",
960
+ "}\n",
961
+ "\n",
962
+ "#sk-container-id-5 div.sk-estimator.fitted {\n",
963
+ " /* fitted */\n",
964
+ " background-color: var(--sklearn-color-fitted-level-0);\n",
965
+ "}\n",
966
+ "\n",
967
+ "/* on hover */\n",
968
+ "#sk-container-id-5 div.sk-estimator:hover {\n",
969
+ " /* unfitted */\n",
970
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
971
+ "}\n",
972
+ "\n",
973
+ "#sk-container-id-5 div.sk-estimator.fitted:hover {\n",
974
+ " /* fitted */\n",
975
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
976
+ "}\n",
977
+ "\n",
978
+ "/* Specification for estimator info (e.g. \"i\" and \"?\") */\n",
979
+ "\n",
980
+ "/* Common style for \"i\" and \"?\" */\n",
981
+ "\n",
982
+ ".sk-estimator-doc-link,\n",
983
+ "a:link.sk-estimator-doc-link,\n",
984
+ "a:visited.sk-estimator-doc-link {\n",
985
+ " float: right;\n",
986
+ " font-size: smaller;\n",
987
+ " line-height: 1em;\n",
988
+ " font-family: monospace;\n",
989
+ " background-color: var(--sklearn-color-background);\n",
990
+ " border-radius: 1em;\n",
991
+ " height: 1em;\n",
992
+ " width: 1em;\n",
993
+ " text-decoration: none !important;\n",
994
+ " margin-left: 0.5em;\n",
995
+ " text-align: center;\n",
996
+ " /* unfitted */\n",
997
+ " border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
998
+ " color: var(--sklearn-color-unfitted-level-1);\n",
999
+ "}\n",
1000
+ "\n",
1001
+ ".sk-estimator-doc-link.fitted,\n",
1002
+ "a:link.sk-estimator-doc-link.fitted,\n",
1003
+ "a:visited.sk-estimator-doc-link.fitted {\n",
1004
+ " /* fitted */\n",
1005
+ " border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
1006
+ " color: var(--sklearn-color-fitted-level-1);\n",
1007
+ "}\n",
1008
+ "\n",
1009
+ "/* On hover */\n",
1010
+ "div.sk-estimator:hover .sk-estimator-doc-link:hover,\n",
1011
+ ".sk-estimator-doc-link:hover,\n",
1012
+ "div.sk-label-container:hover .sk-estimator-doc-link:hover,\n",
1013
+ ".sk-estimator-doc-link:hover {\n",
1014
+ " /* unfitted */\n",
1015
+ " background-color: var(--sklearn-color-unfitted-level-3);\n",
1016
+ " color: var(--sklearn-color-background);\n",
1017
+ " text-decoration: none;\n",
1018
+ "}\n",
1019
+ "\n",
1020
+ "div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n",
1021
+ ".sk-estimator-doc-link.fitted:hover,\n",
1022
+ "div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n",
1023
+ ".sk-estimator-doc-link.fitted:hover {\n",
1024
+ " /* fitted */\n",
1025
+ " background-color: var(--sklearn-color-fitted-level-3);\n",
1026
+ " color: var(--sklearn-color-background);\n",
1027
+ " text-decoration: none;\n",
1028
+ "}\n",
1029
+ "\n",
1030
+ "/* Span, style for the box shown on hovering the info icon */\n",
1031
+ ".sk-estimator-doc-link span {\n",
1032
+ " display: none;\n",
1033
+ " z-index: 9999;\n",
1034
+ " position: relative;\n",
1035
+ " font-weight: normal;\n",
1036
+ " right: .2ex;\n",
1037
+ " padding: .5ex;\n",
1038
+ " margin: .5ex;\n",
1039
+ " width: min-content;\n",
1040
+ " min-width: 20ex;\n",
1041
+ " max-width: 50ex;\n",
1042
+ " color: var(--sklearn-color-text);\n",
1043
+ " box-shadow: 2pt 2pt 4pt #999;\n",
1044
+ " /* unfitted */\n",
1045
+ " background: var(--sklearn-color-unfitted-level-0);\n",
1046
+ " border: .5pt solid var(--sklearn-color-unfitted-level-3);\n",
1047
+ "}\n",
1048
+ "\n",
1049
+ ".sk-estimator-doc-link.fitted span {\n",
1050
+ " /* fitted */\n",
1051
+ " background: var(--sklearn-color-fitted-level-0);\n",
1052
+ " border: var(--sklearn-color-fitted-level-3);\n",
1053
+ "}\n",
1054
+ "\n",
1055
+ ".sk-estimator-doc-link:hover span {\n",
1056
+ " display: block;\n",
1057
+ "}\n",
1058
+ "\n",
1059
+ "/* \"?\"-specific style due to the `<a>` HTML tag */\n",
1060
+ "\n",
1061
+ "#sk-container-id-5 a.estimator_doc_link {\n",
1062
+ " float: right;\n",
1063
+ " font-size: 1rem;\n",
1064
+ " line-height: 1em;\n",
1065
+ " font-family: monospace;\n",
1066
+ " background-color: var(--sklearn-color-background);\n",
1067
+ " border-radius: 1rem;\n",
1068
+ " height: 1rem;\n",
1069
+ " width: 1rem;\n",
1070
+ " text-decoration: none;\n",
1071
+ " /* unfitted */\n",
1072
+ " color: var(--sklearn-color-unfitted-level-1);\n",
1073
+ " border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
1074
+ "}\n",
1075
+ "\n",
1076
+ "#sk-container-id-5 a.estimator_doc_link.fitted {\n",
1077
+ " /* fitted */\n",
1078
+ " border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
1079
+ " color: var(--sklearn-color-fitted-level-1);\n",
1080
+ "}\n",
1081
+ "\n",
1082
+ "/* On hover */\n",
1083
+ "#sk-container-id-5 a.estimator_doc_link:hover {\n",
1084
+ " /* unfitted */\n",
1085
+ " background-color: var(--sklearn-color-unfitted-level-3);\n",
1086
+ " color: var(--sklearn-color-background);\n",
1087
+ " text-decoration: none;\n",
1088
+ "}\n",
1089
+ "\n",
1090
+ "#sk-container-id-5 a.estimator_doc_link.fitted:hover {\n",
1091
+ " /* fitted */\n",
1092
+ " background-color: var(--sklearn-color-fitted-level-3);\n",
1093
+ "}\n",
1094
+ "\n",
1095
+ ".estimator-table summary {\n",
1096
+ " padding: .5rem;\n",
1097
+ " font-family: monospace;\n",
1098
+ " cursor: pointer;\n",
1099
+ "}\n",
1100
+ "\n",
1101
+ ".estimator-table details[open] {\n",
1102
+ " padding-left: 0.1rem;\n",
1103
+ " padding-right: 0.1rem;\n",
1104
+ " padding-bottom: 0.3rem;\n",
1105
+ "}\n",
1106
+ "\n",
1107
+ ".estimator-table .parameters-table {\n",
1108
+ " margin-left: auto !important;\n",
1109
+ " margin-right: auto !important;\n",
1110
+ "}\n",
1111
+ "\n",
1112
+ ".estimator-table .parameters-table tr:nth-child(odd) {\n",
1113
+ " background-color: #fff;\n",
1114
+ "}\n",
1115
+ "\n",
1116
+ ".estimator-table .parameters-table tr:nth-child(even) {\n",
1117
+ " background-color: #f6f6f6;\n",
1118
+ "}\n",
1119
+ "\n",
1120
+ ".estimator-table .parameters-table tr:hover {\n",
1121
+ " background-color: #e0e0e0;\n",
1122
+ "}\n",
1123
+ "\n",
1124
+ ".estimator-table table td {\n",
1125
+ " border: 1px solid rgba(106, 105, 104, 0.232);\n",
1126
+ "}\n",
1127
+ "\n",
1128
+ ".user-set td {\n",
1129
+ " color:rgb(255, 94, 0);\n",
1130
+ " text-align: left;\n",
1131
+ "}\n",
1132
+ "\n",
1133
+ ".user-set td.value pre {\n",
1134
+ " color:rgb(255, 94, 0) !important;\n",
1135
+ " background-color: transparent !important;\n",
1136
+ "}\n",
1137
+ "\n",
1138
+ ".default td {\n",
1139
+ " color: black;\n",
1140
+ " text-align: left;\n",
1141
+ "}\n",
1142
+ "\n",
1143
+ ".user-set td i,\n",
1144
+ ".default td i {\n",
1145
+ " color: black;\n",
1146
+ "}\n",
1147
+ "\n",
1148
+ ".copy-paste-icon {\n",
1149
+ " background-image: url(data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgMCA0NDggNTEyIj48IS0tIUZvbnQgQXdlc29tZSBGcmVlIDYuNy4yIGJ5IEBmb250YXdlc29tZSAtIGh0dHBzOi8vZm9udGF3ZXNvbWUuY29tIExpY2Vuc2UgLSBodHRwczovL2ZvbnRhd2Vzb21lLmNvbS9saWNlbnNlL2ZyZWUgQ29weXJpZ2h0IDIwMjUgRm9udGljb25zLCBJbmMuLS0+PHBhdGggZD0iTTIwOCAwTDMzMi4xIDBjMTIuNyAwIDI0LjkgNS4xIDMzLjkgMTQuMWw2Ny45IDY3LjljOSA5IDE0LjEgMjEuMiAxNC4xIDMzLjlMNDQ4IDMzNmMwIDI2LjUtMjEuNSA0OC00OCA0OGwtMTkyIDBjLTI2LjUgMC00OC0yMS41LTQ4LTQ4bDAtMjg4YzAtMjYuNSAyMS41LTQ4IDQ4LTQ4ek00OCAxMjhsODAgMCAwIDY0LTY0IDAgMCAyNTYgMTkyIDAgMC0zMiA2NCAwIDAgNDhjMCAyNi41LTIxLjUgNDgtNDggNDhMNDggNTEyYy0yNi41IDAtNDgtMjEuNS00OC00OEwwIDE3NmMwLTI2LjUgMjEuNS00OCA0OC00OHoiLz48L3N2Zz4=);\n",
1150
+ " background-repeat: no-repeat;\n",
1151
+ " background-size: 14px 14px;\n",
1152
+ " background-position: 0;\n",
1153
+ " display: inline-block;\n",
1154
+ " width: 14px;\n",
1155
+ " height: 14px;\n",
1156
+ " cursor: pointer;\n",
1157
+ "}\n",
1158
+ "</style><body><div id=\"sk-container-id-5\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>LogisticRegression()</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-5\" type=\"checkbox\" checked><label for=\"sk-estimator-id-5\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow\"><div><div>LogisticRegression</div></div><div><a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.7/modules/generated/sklearn.linear_model.LogisticRegression.html\">?<span>Documentation for LogisticRegression</span></a><span class=\"sk-estimator-doc-link fitted\">i<span>Fitted</span></span></div></label><div class=\"sk-toggleable__content fitted\" data-param-prefix=\"\">\n",
1159
+ " <div class=\"estimator-table\">\n",
1160
+ " <details>\n",
1161
+ " <summary>Parameters</summary>\n",
1162
+ " <table class=\"parameters-table\">\n",
1163
+ " <tbody>\n",
1164
+ " \n",
1165
+ " <tr class=\"default\">\n",
1166
+ " <td><i class=\"copy-paste-icon\"\n",
1167
+ " onclick=\"copyToClipboard('penalty',\n",
1168
+ " this.parentElement.nextElementSibling)\"\n",
1169
+ " ></i></td>\n",
1170
+ " <td class=\"param\">penalty&nbsp;</td>\n",
1171
+ " <td class=\"value\">&#x27;l2&#x27;</td>\n",
1172
+ " </tr>\n",
1173
+ " \n",
1174
+ "\n",
1175
+ " <tr class=\"default\">\n",
1176
+ " <td><i class=\"copy-paste-icon\"\n",
1177
+ " onclick=\"copyToClipboard('dual',\n",
1178
+ " this.parentElement.nextElementSibling)\"\n",
1179
+ " ></i></td>\n",
1180
+ " <td class=\"param\">dual&nbsp;</td>\n",
1181
+ " <td class=\"value\">False</td>\n",
1182
+ " </tr>\n",
1183
+ " \n",
1184
+ "\n",
1185
+ " <tr class=\"default\">\n",
1186
+ " <td><i class=\"copy-paste-icon\"\n",
1187
+ " onclick=\"copyToClipboard('tol',\n",
1188
+ " this.parentElement.nextElementSibling)\"\n",
1189
+ " ></i></td>\n",
1190
+ " <td class=\"param\">tol&nbsp;</td>\n",
1191
+ " <td class=\"value\">0.0001</td>\n",
1192
+ " </tr>\n",
1193
+ " \n",
1194
+ "\n",
1195
+ " <tr class=\"default\">\n",
1196
+ " <td><i class=\"copy-paste-icon\"\n",
1197
+ " onclick=\"copyToClipboard('C',\n",
1198
+ " this.parentElement.nextElementSibling)\"\n",
1199
+ " ></i></td>\n",
1200
+ " <td class=\"param\">C&nbsp;</td>\n",
1201
+ " <td class=\"value\">1.0</td>\n",
1202
+ " </tr>\n",
1203
+ " \n",
1204
+ "\n",
1205
+ " <tr class=\"default\">\n",
1206
+ " <td><i class=\"copy-paste-icon\"\n",
1207
+ " onclick=\"copyToClipboard('fit_intercept',\n",
1208
+ " this.parentElement.nextElementSibling)\"\n",
1209
+ " ></i></td>\n",
1210
+ " <td class=\"param\">fit_intercept&nbsp;</td>\n",
1211
+ " <td class=\"value\">True</td>\n",
1212
+ " </tr>\n",
1213
+ " \n",
1214
+ "\n",
1215
+ " <tr class=\"default\">\n",
1216
+ " <td><i class=\"copy-paste-icon\"\n",
1217
+ " onclick=\"copyToClipboard('intercept_scaling',\n",
1218
+ " this.parentElement.nextElementSibling)\"\n",
1219
+ " ></i></td>\n",
1220
+ " <td class=\"param\">intercept_scaling&nbsp;</td>\n",
1221
+ " <td class=\"value\">1</td>\n",
1222
+ " </tr>\n",
1223
+ " \n",
1224
+ "\n",
1225
+ " <tr class=\"default\">\n",
1226
+ " <td><i class=\"copy-paste-icon\"\n",
1227
+ " onclick=\"copyToClipboard('class_weight',\n",
1228
+ " this.parentElement.nextElementSibling)\"\n",
1229
+ " ></i></td>\n",
1230
+ " <td class=\"param\">class_weight&nbsp;</td>\n",
1231
+ " <td class=\"value\">None</td>\n",
1232
+ " </tr>\n",
1233
+ " \n",
1234
+ "\n",
1235
+ " <tr class=\"default\">\n",
1236
+ " <td><i class=\"copy-paste-icon\"\n",
1237
+ " onclick=\"copyToClipboard('random_state',\n",
1238
+ " this.parentElement.nextElementSibling)\"\n",
1239
+ " ></i></td>\n",
1240
+ " <td class=\"param\">random_state&nbsp;</td>\n",
1241
+ " <td class=\"value\">None</td>\n",
1242
+ " </tr>\n",
1243
+ " \n",
1244
+ "\n",
1245
+ " <tr class=\"default\">\n",
1246
+ " <td><i class=\"copy-paste-icon\"\n",
1247
+ " onclick=\"copyToClipboard('solver',\n",
1248
+ " this.parentElement.nextElementSibling)\"\n",
1249
+ " ></i></td>\n",
1250
+ " <td class=\"param\">solver&nbsp;</td>\n",
1251
+ " <td class=\"value\">&#x27;lbfgs&#x27;</td>\n",
1252
+ " </tr>\n",
1253
+ " \n",
1254
+ "\n",
1255
+ " <tr class=\"default\">\n",
1256
+ " <td><i class=\"copy-paste-icon\"\n",
1257
+ " onclick=\"copyToClipboard('max_iter',\n",
1258
+ " this.parentElement.nextElementSibling)\"\n",
1259
+ " ></i></td>\n",
1260
+ " <td class=\"param\">max_iter&nbsp;</td>\n",
1261
+ " <td class=\"value\">100</td>\n",
1262
+ " </tr>\n",
1263
+ " \n",
1264
+ "\n",
1265
+ " <tr class=\"default\">\n",
1266
+ " <td><i class=\"copy-paste-icon\"\n",
1267
+ " onclick=\"copyToClipboard('multi_class',\n",
1268
+ " this.parentElement.nextElementSibling)\"\n",
1269
+ " ></i></td>\n",
1270
+ " <td class=\"param\">multi_class&nbsp;</td>\n",
1271
+ " <td class=\"value\">&#x27;deprecated&#x27;</td>\n",
1272
+ " </tr>\n",
1273
+ " \n",
1274
+ "\n",
1275
+ " <tr class=\"default\">\n",
1276
+ " <td><i class=\"copy-paste-icon\"\n",
1277
+ " onclick=\"copyToClipboard('verbose',\n",
1278
+ " this.parentElement.nextElementSibling)\"\n",
1279
+ " ></i></td>\n",
1280
+ " <td class=\"param\">verbose&nbsp;</td>\n",
1281
+ " <td class=\"value\">0</td>\n",
1282
+ " </tr>\n",
1283
+ " \n",
1284
+ "\n",
1285
+ " <tr class=\"default\">\n",
1286
+ " <td><i class=\"copy-paste-icon\"\n",
1287
+ " onclick=\"copyToClipboard('warm_start',\n",
1288
+ " this.parentElement.nextElementSibling)\"\n",
1289
+ " ></i></td>\n",
1290
+ " <td class=\"param\">warm_start&nbsp;</td>\n",
1291
+ " <td class=\"value\">False</td>\n",
1292
+ " </tr>\n",
1293
+ " \n",
1294
+ "\n",
1295
+ " <tr class=\"default\">\n",
1296
+ " <td><i class=\"copy-paste-icon\"\n",
1297
+ " onclick=\"copyToClipboard('n_jobs',\n",
1298
+ " this.parentElement.nextElementSibling)\"\n",
1299
+ " ></i></td>\n",
1300
+ " <td class=\"param\">n_jobs&nbsp;</td>\n",
1301
+ " <td class=\"value\">None</td>\n",
1302
+ " </tr>\n",
1303
+ " \n",
1304
+ "\n",
1305
+ " <tr class=\"default\">\n",
1306
+ " <td><i class=\"copy-paste-icon\"\n",
1307
+ " onclick=\"copyToClipboard('l1_ratio',\n",
1308
+ " this.parentElement.nextElementSibling)\"\n",
1309
+ " ></i></td>\n",
1310
+ " <td class=\"param\">l1_ratio&nbsp;</td>\n",
1311
+ " <td class=\"value\">None</td>\n",
1312
+ " </tr>\n",
1313
+ " \n",
1314
+ " </tbody>\n",
1315
+ " </table>\n",
1316
+ " </details>\n",
1317
+ " </div>\n",
1318
+ " </div></div></div></div></div><script>function copyToClipboard(text, element) {\n",
1319
+ " // Get the parameter prefix from the closest toggleable content\n",
1320
+ " const toggleableContent = element.closest('.sk-toggleable__content');\n",
1321
+ " const paramPrefix = toggleableContent ? toggleableContent.dataset.paramPrefix : '';\n",
1322
+ " const fullParamName = paramPrefix ? `${paramPrefix}${text}` : text;\n",
1323
+ "\n",
1324
+ " const originalStyle = element.style;\n",
1325
+ " const computedStyle = window.getComputedStyle(element);\n",
1326
+ " const originalWidth = computedStyle.width;\n",
1327
+ " const originalHTML = element.innerHTML.replace('Copied!', '');\n",
1328
+ "\n",
1329
+ " navigator.clipboard.writeText(fullParamName)\n",
1330
+ " .then(() => {\n",
1331
+ " element.style.width = originalWidth;\n",
1332
+ " element.style.color = 'green';\n",
1333
+ " element.innerHTML = \"Copied!\";\n",
1334
+ "\n",
1335
+ " setTimeout(() => {\n",
1336
+ " element.innerHTML = originalHTML;\n",
1337
+ " element.style = originalStyle;\n",
1338
+ " }, 2000);\n",
1339
+ " })\n",
1340
+ " .catch(err => {\n",
1341
+ " console.error('Failed to copy:', err);\n",
1342
+ " element.style.color = 'red';\n",
1343
+ " element.innerHTML = \"Failed!\";\n",
1344
+ " setTimeout(() => {\n",
1345
+ " element.innerHTML = originalHTML;\n",
1346
+ " element.style = originalStyle;\n",
1347
+ " }, 2000);\n",
1348
+ " });\n",
1349
+ " return false;\n",
1350
+ "}\n",
1351
+ "\n",
1352
+ "document.querySelectorAll('.fa-regular.fa-copy').forEach(function(element) {\n",
1353
+ " const toggleableContent = element.closest('.sk-toggleable__content');\n",
1354
+ " const paramPrefix = toggleableContent ? toggleableContent.dataset.paramPrefix : '';\n",
1355
+ " const paramName = element.parentElement.nextElementSibling.textContent.trim();\n",
1356
+ " const fullParamName = paramPrefix ? `${paramPrefix}${paramName}` : paramName;\n",
1357
+ "\n",
1358
+ " element.setAttribute('title', fullParamName);\n",
1359
+ "});\n",
1360
+ "</script></body>"
1361
+ ],
1362
+ "text/plain": [
1363
+ "LogisticRegression()"
1364
+ ]
1365
+ },
1366
+ "execution_count": 47,
1367
+ "metadata": {},
1368
+ "output_type": "execute_result"
1369
+ }
1370
+ ],
1371
+ "source": [
1372
+ "from sklearn.linear_model import LogisticRegression\n",
1373
+ "\n",
1374
+ "model = LogisticRegression()\n",
1375
+ "model.fit(X_train, y_train) "
1376
+ ]
1377
+ },
1378
+ {
1379
+ "cell_type": "markdown",
1380
+ "id": "44e69e07",
1381
+ "metadata": {},
1382
+ "source": [
1383
+ "# 3.2. Evaluation"
1384
+ ]
1385
+ },
1386
+ {
1387
+ "cell_type": "code",
1388
+ "execution_count": 48,
1389
+ "id": "9854e091",
1390
+ "metadata": {},
1391
+ "outputs": [
1392
+ {
1393
+ "data": {
1394
+ "text/plain": [
1395
+ "array([0, 0, 1, 1, 1])"
1396
+ ]
1397
+ },
1398
+ "execution_count": 48,
1399
+ "metadata": {},
1400
+ "output_type": "execute_result"
1401
+ }
1402
+ ],
1403
+ "source": [
1404
+ "y_pred = model.predict(X_test)\n",
1405
+ "y_pred[:5] "
1406
+ ]
1407
+ },
1408
+ {
1409
+ "cell_type": "code",
1410
+ "execution_count": 50,
1411
+ "id": "6a2e267d",
1412
+ "metadata": {},
1413
+ "outputs": [
1414
+ {
1415
+ "name": "stdout",
1416
+ "output_type": "stream",
1417
+ "text": [
1418
+ " precision recall f1-score support\n",
1419
+ "\n",
1420
+ " 0 0.98 0.99 0.98 7548\n",
1421
+ " 1 0.99 0.98 0.98 8240\n",
1422
+ "\n",
1423
+ " accuracy 0.98 15788\n",
1424
+ " macro avg 0.98 0.98 0.98 15788\n",
1425
+ "weighted avg 0.98 0.98 0.98 15788\n",
1426
+ "\n"
1427
+ ]
1428
+ }
1429
+ ],
1430
+ "source": [
1431
+ "from sklearn.metrics import classification_report\n",
1432
+ "\n",
1433
+ "report = classification_report(y_pred, y_test)\n",
1434
+ "print(report)"
1435
+ ]
1436
+ },
1437
+ {
1438
+ "cell_type": "code",
1439
+ "execution_count": 21,
1440
+ "id": "26427da2",
1441
+ "metadata": {},
1442
+ "outputs": [
1443
+ {
1444
+ "name": "stdout",
1445
+ "output_type": "stream",
1446
+ "text": [
1447
+ "accuracy = 97.5%\n",
1448
+ "precision_macro = 97.5%\n",
1449
+ "recall_macro = 97.5%\n",
1450
+ "f1_macro = 97.5%\n"
1451
+ ]
1452
+ }
1453
+ ],
1454
+ "source": [
1455
+ "import numpy as np\n",
1456
+ "from sklearn.model_selection import StratifiedKFold, cross_validate \n",
1457
+ "\n",
1458
+ "metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']\n",
1459
+ "k_fold_cv = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)\n",
1460
+ "scoring = cross_validate(model, X_test, y_test, scoring = metrics, cv = k_fold_cv)\n",
1461
+ "\n",
1462
+ "for metric in metrics:\n",
1463
+ " score = np.mean(scoring[f'test_{metric}'])\n",
1464
+ " score = round(score, 3) * 100\n",
1465
+ " print(f\"{metric} = {score}%\")"
1466
+ ]
1467
+ },
1468
+ {
1469
+ "cell_type": "markdown",
1470
+ "id": "95789a38",
1471
+ "metadata": {},
1472
+ "source": [
1473
+ "# 3.3. Saving Model and Vectorizer"
1474
+ ]
1475
+ },
1476
+ {
1477
+ "cell_type": "code",
1478
+ "execution_count": null,
1479
+ "id": "3c63ead2",
1480
+ "metadata": {},
1481
+ "outputs": [],
1482
+ "source": [
1483
+ "import pickle \n",
1484
+ "\n",
1485
+ "with open('../models/logistic_regression.pkl', 'wb') as file:\n",
1486
+ " pickle.dump(model, file)"
1487
+ ]
1488
+ },
1489
+ {
1490
+ "cell_type": "code",
1491
+ "execution_count": 53,
1492
+ "id": "36d617fa",
1493
+ "metadata": {},
1494
+ "outputs": [],
1495
+ "source": [
1496
+ "with open('../models/vectorizer.pkl', 'wb') as file:\n",
1497
+ " pickle.dump(vectorizer, file)"
1498
+ ]
1499
+ },
1500
+ {
1501
+ "cell_type": "code",
1502
+ "execution_count": null,
1503
+ "id": "c8277f43",
1504
+ "metadata": {},
1505
+ "outputs": [],
1506
+ "source": []
1507
+ }
1508
+ ],
1509
+ "metadata": {
1510
+ "kernelspec": {
1511
+ "display_name": ".venv",
1512
+ "language": "python",
1513
+ "name": "python3"
1514
+ },
1515
+ "language_info": {
1516
+ "codemirror_mode": {
1517
+ "name": "ipython",
1518
+ "version": 3
1519
+ },
1520
+ "file_extension": ".py",
1521
+ "mimetype": "text/x-python",
1522
+ "name": "python",
1523
+ "nbconvert_exporter": "python",
1524
+ "pygments_lexer": "ipython3",
1525
+ "version": "3.12.9"
1526
+ }
1527
+ },
1528
+ "nbformat": 4,
1529
+ "nbformat_minor": 5
1530
+ }
notebooks/Naive_Bayes.ipynb ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "cf0be37c",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Reading Processed Data"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": 2,
14
+ "id": "43a52e6b",
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "import pandas as pd"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": null,
24
+ "id": "34291c43",
25
+ "metadata": {},
26
+ "outputs": [
27
+ {
28
+ "data": {
29
+ "text/html": [
30
+ "<div>\n",
31
+ "<style scoped>\n",
32
+ " .dataframe tbody tr th:only-of-type {\n",
33
+ " vertical-align: middle;\n",
34
+ " }\n",
35
+ "\n",
36
+ " .dataframe tbody tr th {\n",
37
+ " vertical-align: top;\n",
38
+ " }\n",
39
+ "\n",
40
+ " .dataframe thead th {\n",
41
+ " text-align: right;\n",
42
+ " }\n",
43
+ "</style>\n",
44
+ "<table border=\"1\" class=\"dataframe\">\n",
45
+ " <thead>\n",
46
+ " <tr style=\"text-align: right;\">\n",
47
+ " <th></th>\n",
48
+ " <th>label</th>\n",
49
+ " <th>text</th>\n",
50
+ " </tr>\n",
51
+ " </thead>\n",
52
+ " <tbody>\n",
53
+ " <tr>\n",
54
+ " <th>0</th>\n",
55
+ " <td>0</td>\n",
56
+ " <td>subscribe change profile contact u long term e...</td>\n",
57
+ " </tr>\n",
58
+ " <tr>\n",
59
+ " <th>1</th>\n",
60
+ " <td>1</td>\n",
61
+ " <td>hi we have a new opportunity for you and your ...</td>\n",
62
+ " </tr>\n",
63
+ " <tr>\n",
64
+ " <th>2</th>\n",
65
+ " <td>0</td>\n",
66
+ " <td>sally i forgot to attach the list of student c...</td>\n",
67
+ " </tr>\n",
68
+ " <tr>\n",
69
+ " <th>3</th>\n",
70
+ " <td>0</td>\n",
71
+ " <td>original message from swisher stephen sent tue...</td>\n",
72
+ " </tr>\n",
73
+ " <tr>\n",
74
+ " <th>4</th>\n",
75
+ " <td>0</td>\n",
76
+ " <td>h hermann writes h hello hynek speech dispatch...</td>\n",
77
+ " </tr>\n",
78
+ " </tbody>\n",
79
+ "</table>\n",
80
+ "</div>"
81
+ ],
82
+ "text/plain": [
83
+ " label text\n",
84
+ "0 0 subscribe change profile contact u long term e...\n",
85
+ "1 1 hi we have a new opportunity for you and your ...\n",
86
+ "2 0 sally i forgot to attach the list of student c...\n",
87
+ "3 0 original message from swisher stephen sent tue...\n",
88
+ "4 0 h hermann writes h hello hynek speech dispatch..."
89
+ ]
90
+ },
91
+ "execution_count": 4,
92
+ "metadata": {},
93
+ "output_type": "execute_result"
94
+ }
95
+ ],
96
+ "source": [
97
+ "train_df = pd.read_csv('../data/processed/train.csv')\n",
98
+ "test_df = pd.read_csv('../data/processed/test.csv')\n",
99
+ "\n",
100
+ "train_df.head()"
101
+ ]
102
+ },
103
+ {
104
+ "cell_type": "code",
105
+ "execution_count": 5,
106
+ "id": "bd924063",
107
+ "metadata": {},
108
+ "outputs": [],
109
+ "source": [
110
+ "X_train = train_df['text']\n",
111
+ "X_test = test_df['text']\n",
112
+ "y_train = train_df['label']\n",
113
+ "y_test = test_df['label']"
114
+ ]
115
+ },
116
+ {
117
+ "cell_type": "markdown",
118
+ "id": "17a6bbf8",
119
+ "metadata": {},
120
+ "source": [
121
+ "# Vectorization"
122
+ ]
123
+ },
124
+ {
125
+ "cell_type": "code",
126
+ "execution_count": null,
127
+ "id": "3871f79f",
128
+ "metadata": {},
129
+ "outputs": [],
130
+ "source": [
131
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
132
+ "\n",
133
+ "## Code"
134
+ ]
135
+ },
136
+ {
137
+ "cell_type": "markdown",
138
+ "id": "f1807de9",
139
+ "metadata": {},
140
+ "source": [
141
+ "# Model Training"
142
+ ]
143
+ },
144
+ {
145
+ "cell_type": "code",
146
+ "execution_count": null,
147
+ "id": "129e33cf",
148
+ "metadata": {},
149
+ "outputs": [],
150
+ "source": [
151
+ "from sklearn.naive_bayes import MultinomialNB\n",
152
+ "\n",
153
+ "## Code"
154
+ ]
155
+ },
156
+ {
157
+ "cell_type": "markdown",
158
+ "id": "7fa92718",
159
+ "metadata": {},
160
+ "source": [
161
+ "## Model Evaluation"
162
+ ]
163
+ },
164
+ {
165
+ "cell_type": "code",
166
+ "execution_count": null,
167
+ "id": "ba1f46c4",
168
+ "metadata": {},
169
+ "outputs": [],
170
+ "source": [
171
+ "from sklearn.metrics import classification_report \n",
172
+ "\n",
173
+ "## Code"
174
+ ]
175
+ },
176
+ {
177
+ "cell_type": "markdown",
178
+ "id": "c666bfa7",
179
+ "metadata": {},
180
+ "source": [
181
+ "## Model Saving"
182
+ ]
183
+ },
184
+ {
185
+ "cell_type": "code",
186
+ "execution_count": 6,
187
+ "id": "64f48ba0",
188
+ "metadata": {},
189
+ "outputs": [],
190
+ "source": [
191
+ "## Code"
192
+ ]
193
+ }
194
+ ],
195
+ "metadata": {
196
+ "kernelspec": {
197
+ "display_name": ".venv",
198
+ "language": "python",
199
+ "name": "python3"
200
+ },
201
+ "language_info": {
202
+ "codemirror_mode": {
203
+ "name": "ipython",
204
+ "version": 3
205
+ },
206
+ "file_extension": ".py",
207
+ "mimetype": "text/x-python",
208
+ "name": "python",
209
+ "nbconvert_exporter": "python",
210
+ "pygments_lexer": "ipython3",
211
+ "version": "3.12.9"
212
+ }
213
+ },
214
+ "nbformat": 4,
215
+ "nbformat_minor": 5
216
+ }
notebooks/SVM.ipynb ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "cf0be37c",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Reading Processed Data"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": 2,
14
+ "id": "43a52e6b",
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "import pandas as pd"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": null,
24
+ "id": "34291c43",
25
+ "metadata": {},
26
+ "outputs": [
27
+ {
28
+ "data": {
29
+ "text/html": [
30
+ "<div>\n",
31
+ "<style scoped>\n",
32
+ " .dataframe tbody tr th:only-of-type {\n",
33
+ " vertical-align: middle;\n",
34
+ " }\n",
35
+ "\n",
36
+ " .dataframe tbody tr th {\n",
37
+ " vertical-align: top;\n",
38
+ " }\n",
39
+ "\n",
40
+ " .dataframe thead th {\n",
41
+ " text-align: right;\n",
42
+ " }\n",
43
+ "</style>\n",
44
+ "<table border=\"1\" class=\"dataframe\">\n",
45
+ " <thead>\n",
46
+ " <tr style=\"text-align: right;\">\n",
47
+ " <th></th>\n",
48
+ " <th>label</th>\n",
49
+ " <th>text</th>\n",
50
+ " </tr>\n",
51
+ " </thead>\n",
52
+ " <tbody>\n",
53
+ " <tr>\n",
54
+ " <th>0</th>\n",
55
+ " <td>0</td>\n",
56
+ " <td>subscribe change profile contact u long term e...</td>\n",
57
+ " </tr>\n",
58
+ " <tr>\n",
59
+ " <th>1</th>\n",
60
+ " <td>1</td>\n",
61
+ " <td>hi we have a new opportunity for you and your ...</td>\n",
62
+ " </tr>\n",
63
+ " <tr>\n",
64
+ " <th>2</th>\n",
65
+ " <td>0</td>\n",
66
+ " <td>sally i forgot to attach the list of student c...</td>\n",
67
+ " </tr>\n",
68
+ " <tr>\n",
69
+ " <th>3</th>\n",
70
+ " <td>0</td>\n",
71
+ " <td>original message from swisher stephen sent tue...</td>\n",
72
+ " </tr>\n",
73
+ " <tr>\n",
74
+ " <th>4</th>\n",
75
+ " <td>0</td>\n",
76
+ " <td>h hermann writes h hello hynek speech dispatch...</td>\n",
77
+ " </tr>\n",
78
+ " </tbody>\n",
79
+ "</table>\n",
80
+ "</div>"
81
+ ],
82
+ "text/plain": [
83
+ " label text\n",
84
+ "0 0 subscribe change profile contact u long term e...\n",
85
+ "1 1 hi we have a new opportunity for you and your ...\n",
86
+ "2 0 sally i forgot to attach the list of student c...\n",
87
+ "3 0 original message from swisher stephen sent tue...\n",
88
+ "4 0 h hermann writes h hello hynek speech dispatch..."
89
+ ]
90
+ },
91
+ "execution_count": 4,
92
+ "metadata": {},
93
+ "output_type": "execute_result"
94
+ }
95
+ ],
96
+ "source": [
97
+ "train_df = pd.read_csv('../data/processed/train.csv')\n",
98
+ "test_df = pd.read_csv('../data/processed/test.csv')\n",
99
+ "\n",
100
+ "train_df.head()"
101
+ ]
102
+ },
103
+ {
104
+ "cell_type": "code",
105
+ "execution_count": 5,
106
+ "id": "bd924063",
107
+ "metadata": {},
108
+ "outputs": [],
109
+ "source": [
110
+ "X_train = train_df['text']\n",
111
+ "X_test = test_df['text']\n",
112
+ "y_train = train_df['label']\n",
113
+ "y_test = test_df['label']"
114
+ ]
115
+ },
116
+ {
117
+ "cell_type": "markdown",
118
+ "id": "17a6bbf8",
119
+ "metadata": {},
120
+ "source": [
121
+ "# Vectorization"
122
+ ]
123
+ },
124
+ {
125
+ "cell_type": "code",
126
+ "execution_count": null,
127
+ "id": "3871f79f",
128
+ "metadata": {},
129
+ "outputs": [],
130
+ "source": [
131
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
132
+ "\n",
133
+ "## Code"
134
+ ]
135
+ },
136
+ {
137
+ "cell_type": "markdown",
138
+ "id": "f1807de9",
139
+ "metadata": {},
140
+ "source": [
141
+ "# Model Training"
142
+ ]
143
+ },
144
+ {
145
+ "cell_type": "code",
146
+ "execution_count": null,
147
+ "id": "129e33cf",
148
+ "metadata": {},
149
+ "outputs": [],
150
+ "source": [
151
+ "from sklearn.svm import SVC\n",
152
+ "\n",
153
+ "## Code"
154
+ ]
155
+ },
156
+ {
157
+ "cell_type": "markdown",
158
+ "id": "7fa92718",
159
+ "metadata": {},
160
+ "source": [
161
+ "## Model Evaluation"
162
+ ]
163
+ },
164
+ {
165
+ "cell_type": "code",
166
+ "execution_count": null,
167
+ "id": "ba1f46c4",
168
+ "metadata": {},
169
+ "outputs": [],
170
+ "source": [
171
+ "from sklearn.metrics import classification_report \n",
172
+ "\n",
173
+ "## Code"
174
+ ]
175
+ },
176
+ {
177
+ "cell_type": "markdown",
178
+ "id": "c666bfa7",
179
+ "metadata": {},
180
+ "source": [
181
+ "## Model Saving"
182
+ ]
183
+ },
184
+ {
185
+ "cell_type": "code",
186
+ "execution_count": 6,
187
+ "id": "64f48ba0",
188
+ "metadata": {},
189
+ "outputs": [],
190
+ "source": [
191
+ "## Code"
192
+ ]
193
+ }
194
+ ],
195
+ "metadata": {
196
+ "kernelspec": {
197
+ "display_name": ".venv",
198
+ "language": "python",
199
+ "name": "python3"
200
+ },
201
+ "language_info": {
202
+ "codemirror_mode": {
203
+ "name": "ipython",
204
+ "version": 3
205
+ },
206
+ "file_extension": ".py",
207
+ "mimetype": "text/x-python",
208
+ "name": "python",
209
+ "nbconvert_exporter": "python",
210
+ "pygments_lexer": "ipython3",
211
+ "version": "3.12.9"
212
+ }
213
+ },
214
+ "nbformat": 4,
215
+ "nbformat_minor": 5
216
+ }
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Data Analysis and ML Libraries
2
+ scikit-learn
3
+ pandas
4
+ matplotlib
5
+ ipykernel
6
+
7
+ ## NLP Libraries
8
+ langdetect
9
+ nltk
10
+ contractions
11
+
12
+ ## UI Library
13
+ streamlit