PranavReddy18 committed on
Commit
d89e69d
·
verified ·
1 Parent(s): cf76166

Upload 2 files

Browse files
Files changed (2) hide show
  1. BBC_NEWS_PREDICTION.ipynb +845 -0
  2. bbc_data.csv +0 -0
BBC_NEWS_PREDICTION.ipynb ADDED
@@ -0,0 +1,845 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 78,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import pandas as pd\n",
10
+ "import numpy as np"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 79,
16
+ "metadata": {},
17
+ "outputs": [],
18
+ "source": [
19
+ "df=pd.read_csv(\"C:\\\\Users\\\\saipr\\\\anaconda3\\\\Projects\\\\News_Classification\\\\bbc_data.csv\")"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "code",
24
+ "execution_count": 80,
25
+ "metadata": {},
26
+ "outputs": [
27
+ {
28
+ "data": {
29
+ "text/html": [
30
+ "<div>\n",
31
+ "<style scoped>\n",
32
+ " .dataframe tbody tr th:only-of-type {\n",
33
+ " vertical-align: middle;\n",
34
+ " }\n",
35
+ "\n",
36
+ " .dataframe tbody tr th {\n",
37
+ " vertical-align: top;\n",
38
+ " }\n",
39
+ "\n",
40
+ " .dataframe thead th {\n",
41
+ " text-align: right;\n",
42
+ " }\n",
43
+ "</style>\n",
44
+ "<table border=\"1\" class=\"dataframe\">\n",
45
+ " <thead>\n",
46
+ " <tr style=\"text-align: right;\">\n",
47
+ " <th></th>\n",
48
+ " <th>data</th>\n",
49
+ " <th>labels</th>\n",
50
+ " </tr>\n",
51
+ " </thead>\n",
52
+ " <tbody>\n",
53
+ " <tr>\n",
54
+ " <th>0</th>\n",
55
+ " <td>Musicians to tackle US red tape Musicians gro...</td>\n",
56
+ " <td>entertainment</td>\n",
57
+ " </tr>\n",
58
+ " <tr>\n",
59
+ " <th>1</th>\n",
60
+ " <td>U2s desire to be number one U2, who have won ...</td>\n",
61
+ " <td>entertainment</td>\n",
62
+ " </tr>\n",
63
+ " <tr>\n",
64
+ " <th>2</th>\n",
65
+ " <td>Rocker Doherty in on-stage fight Rock singer ...</td>\n",
66
+ " <td>entertainment</td>\n",
67
+ " </tr>\n",
68
+ " <tr>\n",
69
+ " <th>3</th>\n",
70
+ " <td>Snicket tops US box office chart The film ada...</td>\n",
71
+ " <td>entertainment</td>\n",
72
+ " </tr>\n",
73
+ " <tr>\n",
74
+ " <th>4</th>\n",
75
+ " <td>Oceans Twelve raids box office Oceans Twelve,...</td>\n",
76
+ " <td>entertainment</td>\n",
77
+ " </tr>\n",
78
+ " <tr>\n",
79
+ " <th>...</th>\n",
80
+ " <td>...</td>\n",
81
+ " <td>...</td>\n",
82
+ " </tr>\n",
83
+ " <tr>\n",
84
+ " <th>2220</th>\n",
85
+ " <td>Warning over Windows Word files Writing a Mic...</td>\n",
86
+ " <td>tech</td>\n",
87
+ " </tr>\n",
88
+ " <tr>\n",
89
+ " <th>2221</th>\n",
90
+ " <td>Fast lifts rise into record books Two high-sp...</td>\n",
91
+ " <td>tech</td>\n",
92
+ " </tr>\n",
93
+ " <tr>\n",
94
+ " <th>2222</th>\n",
95
+ " <td>Nintendo adds media playing to DS Nintendo is...</td>\n",
96
+ " <td>tech</td>\n",
97
+ " </tr>\n",
98
+ " <tr>\n",
99
+ " <th>2223</th>\n",
100
+ " <td>Fast moving phone viruses appear Security fir...</td>\n",
101
+ " <td>tech</td>\n",
102
+ " </tr>\n",
103
+ " <tr>\n",
104
+ " <th>2224</th>\n",
105
+ " <td>Hacker threat to Apples iTunes Users of Apple...</td>\n",
106
+ " <td>tech</td>\n",
107
+ " </tr>\n",
108
+ " </tbody>\n",
109
+ "</table>\n",
110
+ "<p>2225 rows × 2 columns</p>\n",
111
+ "</div>"
112
+ ],
113
+ "text/plain": [
114
+ " data labels\n",
115
+ "0 Musicians to tackle US red tape Musicians gro... entertainment\n",
116
+ "1 U2s desire to be number one U2, who have won ... entertainment\n",
117
+ "2 Rocker Doherty in on-stage fight Rock singer ... entertainment\n",
118
+ "3 Snicket tops US box office chart The film ada... entertainment\n",
119
+ "4 Oceans Twelve raids box office Oceans Twelve,... entertainment\n",
120
+ "... ... ...\n",
121
+ "2220 Warning over Windows Word files Writing a Mic... tech\n",
122
+ "2221 Fast lifts rise into record books Two high-sp... tech\n",
123
+ "2222 Nintendo adds media playing to DS Nintendo is... tech\n",
124
+ "2223 Fast moving phone viruses appear Security fir... tech\n",
125
+ "2224 Hacker threat to Apples iTunes Users of Apple... tech\n",
126
+ "\n",
127
+ "[2225 rows x 2 columns]"
128
+ ]
129
+ },
130
+ "execution_count": 80,
131
+ "metadata": {},
132
+ "output_type": "execute_result"
133
+ }
134
+ ],
135
+ "source": [
136
+ "df"
137
+ ]
138
+ },
139
+ {
140
+ "cell_type": "code",
141
+ "execution_count": 81,
142
+ "metadata": {},
143
+ "outputs": [
144
+ {
145
+ "data": {
146
+ "text/plain": [
147
+ "labels\n",
148
+ "sport 511\n",
149
+ "business 510\n",
150
+ "politics 417\n",
151
+ "tech 401\n",
152
+ "entertainment 386\n",
153
+ "Name: count, dtype: int64"
154
+ ]
155
+ },
156
+ "execution_count": 81,
157
+ "metadata": {},
158
+ "output_type": "execute_result"
159
+ }
160
+ ],
161
+ "source": [
162
+ "df['labels'].value_counts()"
163
+ ]
164
+ },
165
+ {
166
+ "cell_type": "code",
167
+ "execution_count": 82,
168
+ "metadata": {},
169
+ "outputs": [
170
+ {
171
+ "data": {
172
+ "text/plain": [
173
+ "data 0\n",
174
+ "labels 0\n",
175
+ "dtype: int64"
176
+ ]
177
+ },
178
+ "execution_count": 82,
179
+ "metadata": {},
180
+ "output_type": "execute_result"
181
+ }
182
+ ],
183
+ "source": [
184
+ "df.isnull().sum()"
185
+ ]
186
+ },
187
+ {
188
+ "cell_type": "code",
189
+ "execution_count": 83,
190
+ "metadata": {},
191
+ "outputs": [],
192
+ "source": [
193
+ "# Split the data into features and target\n",
194
+ "X = df['data']\n",
195
+ "y = df['labels']\n"
196
+ ]
197
+ },
198
+ {
199
+ "cell_type": "code",
200
+ "execution_count": 84,
201
+ "metadata": {},
202
+ "outputs": [
203
+ {
204
+ "name": "stderr",
205
+ "output_type": "stream",
206
+ "text": [
207
+ "[nltk_data] Downloading package stopwords to\n",
208
+ "[nltk_data] C:\\Users\\saipr\\AppData\\Roaming\\nltk_data...\n",
209
+ "[nltk_data] Package stopwords is already up-to-date!\n",
210
+ "[nltk_data] Downloading package punkt to\n",
211
+ "[nltk_data] C:\\Users\\saipr\\AppData\\Roaming\\nltk_data...\n",
212
+ "[nltk_data] Package punkt is already up-to-date!\n"
213
+ ]
214
+ }
215
+ ],
216
+ "source": [
217
+ "import re\n",
218
+ "from nltk.tokenize import word_tokenize\n",
219
+ "from nltk.corpus import stopwords\n",
220
+ "\n",
221
+ "# Ensure required NLTK data is available\n",
222
+ "import nltk\n",
223
+ "nltk.download('stopwords')\n",
224
+ "nltk.download('punkt')\n",
225
+ "\n",
226
+ "# Load stopwords only once\n",
227
+ "stop_words = set(stopwords.words('english'))\n"
228
+ ]
229
+ },
230
+ {
231
+ "cell_type": "code",
232
+ "execution_count": 85,
233
+ "metadata": {},
234
+ "outputs": [],
235
+ "source": [
236
+ "\n",
237
+ "def preprocess_text(text):\n",
238
+ " # Remove punctuation and convert to lower case\n",
239
+ " text = re.sub(r'[^\\w\\s]', '', text.lower())\n",
240
+ " # Tokenize\n",
241
+ " tokens = word_tokenize(text)\n",
242
+ " # Remove stopwords\n",
243
+ " tokens = [word for word in tokens if word not in stop_words]\n",
244
+ " return ' '.join(tokens)\n",
245
+ "\n",
246
+ "# Apply preprocessing to the dataframe\n",
247
+ "df['processed_data'] = df['data'].apply(preprocess_text)\n"
248
+ ]
249
+ },
250
+ {
251
+ "cell_type": "code",
252
+ "execution_count": 86,
253
+ "metadata": {},
254
+ "outputs": [],
255
+ "source": [
256
+ "from sklearn.model_selection import train_test_split # Import the function\n",
257
+ "\n",
258
+ "# Split the data into training and testing sets\n",
259
+ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n"
260
+ ]
261
+ },
262
+ {
263
+ "cell_type": "code",
264
+ "execution_count": 87,
265
+ "metadata": {},
266
+ "outputs": [],
267
+ "source": [
268
+ "from sklearn.feature_extraction.text import TfidfVectorizer # Import TfidfVectorizer\n",
269
+ "\n",
270
+ "# Convert text data to numerical data using TF-IDF\n",
271
+ "vectorizer = TfidfVectorizer(stop_words='english')\n",
272
+ "X_train_tfidf = vectorizer.fit_transform(X_train)\n",
273
+ "X_test_tfidf = vectorizer.transform(X_test)\n"
274
+ ]
275
+ },
276
+ {
277
+ "cell_type": "code",
278
+ "execution_count": 88,
279
+ "metadata": {},
280
+ "outputs": [
281
+ {
282
+ "data": {
283
+ "text/html": [
284
+ "<style>#sk-container-id-3 {\n",
285
+ " /* Definition of color scheme common for light and dark mode */\n",
286
+ " --sklearn-color-text: black;\n",
287
+ " --sklearn-color-line: gray;\n",
288
+ " /* Definition of color scheme for unfitted estimators */\n",
289
+ " --sklearn-color-unfitted-level-0: #fff5e6;\n",
290
+ " --sklearn-color-unfitted-level-1: #f6e4d2;\n",
291
+ " --sklearn-color-unfitted-level-2: #ffe0b3;\n",
292
+ " --sklearn-color-unfitted-level-3: chocolate;\n",
293
+ " /* Definition of color scheme for fitted estimators */\n",
294
+ " --sklearn-color-fitted-level-0: #f0f8ff;\n",
295
+ " --sklearn-color-fitted-level-1: #d4ebff;\n",
296
+ " --sklearn-color-fitted-level-2: #b3dbfd;\n",
297
+ " --sklearn-color-fitted-level-3: cornflowerblue;\n",
298
+ "\n",
299
+ " /* Specific color for light theme */\n",
300
+ " --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
301
+ " --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n",
302
+ " --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
303
+ " --sklearn-color-icon: #696969;\n",
304
+ "\n",
305
+ " @media (prefers-color-scheme: dark) {\n",
306
+ " /* Redefinition of color scheme for dark theme */\n",
307
+ " --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
308
+ " --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n",
309
+ " --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
310
+ " --sklearn-color-icon: #878787;\n",
311
+ " }\n",
312
+ "}\n",
313
+ "\n",
314
+ "#sk-container-id-3 {\n",
315
+ " color: var(--sklearn-color-text);\n",
316
+ "}\n",
317
+ "\n",
318
+ "#sk-container-id-3 pre {\n",
319
+ " padding: 0;\n",
320
+ "}\n",
321
+ "\n",
322
+ "#sk-container-id-3 input.sk-hidden--visually {\n",
323
+ " border: 0;\n",
324
+ " clip: rect(1px 1px 1px 1px);\n",
325
+ " clip: rect(1px, 1px, 1px, 1px);\n",
326
+ " height: 1px;\n",
327
+ " margin: -1px;\n",
328
+ " overflow: hidden;\n",
329
+ " padding: 0;\n",
330
+ " position: absolute;\n",
331
+ " width: 1px;\n",
332
+ "}\n",
333
+ "\n",
334
+ "#sk-container-id-3 div.sk-dashed-wrapped {\n",
335
+ " border: 1px dashed var(--sklearn-color-line);\n",
336
+ " margin: 0 0.4em 0.5em 0.4em;\n",
337
+ " box-sizing: border-box;\n",
338
+ " padding-bottom: 0.4em;\n",
339
+ " background-color: var(--sklearn-color-background);\n",
340
+ "}\n",
341
+ "\n",
342
+ "#sk-container-id-3 div.sk-container {\n",
343
+ " /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n",
344
+ " but bootstrap.min.css set `[hidden] { display: none !important; }`\n",
345
+ " so we also need the `!important` here to be able to override the\n",
346
+ " default hidden behavior on the sphinx rendered scikit-learn.org.\n",
347
+ " See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n",
348
+ " display: inline-block !important;\n",
349
+ " position: relative;\n",
350
+ "}\n",
351
+ "\n",
352
+ "#sk-container-id-3 div.sk-text-repr-fallback {\n",
353
+ " display: none;\n",
354
+ "}\n",
355
+ "\n",
356
+ "div.sk-parallel-item,\n",
357
+ "div.sk-serial,\n",
358
+ "div.sk-item {\n",
359
+ " /* draw centered vertical line to link estimators */\n",
360
+ " background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n",
361
+ " background-size: 2px 100%;\n",
362
+ " background-repeat: no-repeat;\n",
363
+ " background-position: center center;\n",
364
+ "}\n",
365
+ "\n",
366
+ "/* Parallel-specific style estimator block */\n",
367
+ "\n",
368
+ "#sk-container-id-3 div.sk-parallel-item::after {\n",
369
+ " content: \"\";\n",
370
+ " width: 100%;\n",
371
+ " border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n",
372
+ " flex-grow: 1;\n",
373
+ "}\n",
374
+ "\n",
375
+ "#sk-container-id-3 div.sk-parallel {\n",
376
+ " display: flex;\n",
377
+ " align-items: stretch;\n",
378
+ " justify-content: center;\n",
379
+ " background-color: var(--sklearn-color-background);\n",
380
+ " position: relative;\n",
381
+ "}\n",
382
+ "\n",
383
+ "#sk-container-id-3 div.sk-parallel-item {\n",
384
+ " display: flex;\n",
385
+ " flex-direction: column;\n",
386
+ "}\n",
387
+ "\n",
388
+ "#sk-container-id-3 div.sk-parallel-item:first-child::after {\n",
389
+ " align-self: flex-end;\n",
390
+ " width: 50%;\n",
391
+ "}\n",
392
+ "\n",
393
+ "#sk-container-id-3 div.sk-parallel-item:last-child::after {\n",
394
+ " align-self: flex-start;\n",
395
+ " width: 50%;\n",
396
+ "}\n",
397
+ "\n",
398
+ "#sk-container-id-3 div.sk-parallel-item:only-child::after {\n",
399
+ " width: 0;\n",
400
+ "}\n",
401
+ "\n",
402
+ "/* Serial-specific style estimator block */\n",
403
+ "\n",
404
+ "#sk-container-id-3 div.sk-serial {\n",
405
+ " display: flex;\n",
406
+ " flex-direction: column;\n",
407
+ " align-items: center;\n",
408
+ " background-color: var(--sklearn-color-background);\n",
409
+ " padding-right: 1em;\n",
410
+ " padding-left: 1em;\n",
411
+ "}\n",
412
+ "\n",
413
+ "\n",
414
+ "/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n",
415
+ "clickable and can be expanded/collapsed.\n",
416
+ "- Pipeline and ColumnTransformer use this feature and define the default style\n",
417
+ "- Estimators will overwrite some part of the style using the `sk-estimator` class\n",
418
+ "*/\n",
419
+ "\n",
420
+ "/* Pipeline and ColumnTransformer style (default) */\n",
421
+ "\n",
422
+ "#sk-container-id-3 div.sk-toggleable {\n",
423
+ " /* Default theme specific background. It is overwritten whether we have a\n",
424
+ " specific estimator or a Pipeline/ColumnTransformer */\n",
425
+ " background-color: var(--sklearn-color-background);\n",
426
+ "}\n",
427
+ "\n",
428
+ "/* Toggleable label */\n",
429
+ "#sk-container-id-3 label.sk-toggleable__label {\n",
430
+ " cursor: pointer;\n",
431
+ " display: block;\n",
432
+ " width: 100%;\n",
433
+ " margin-bottom: 0;\n",
434
+ " padding: 0.5em;\n",
435
+ " box-sizing: border-box;\n",
436
+ " text-align: center;\n",
437
+ "}\n",
438
+ "\n",
439
+ "#sk-container-id-3 label.sk-toggleable__label-arrow:before {\n",
440
+ " /* Arrow on the left of the label */\n",
441
+ " content: \"▸\";\n",
442
+ " float: left;\n",
443
+ " margin-right: 0.25em;\n",
444
+ " color: var(--sklearn-color-icon);\n",
445
+ "}\n",
446
+ "\n",
447
+ "#sk-container-id-3 label.sk-toggleable__label-arrow:hover:before {\n",
448
+ " color: var(--sklearn-color-text);\n",
449
+ "}\n",
450
+ "\n",
451
+ "/* Toggleable content - dropdown */\n",
452
+ "\n",
453
+ "#sk-container-id-3 div.sk-toggleable__content {\n",
454
+ " max-height: 0;\n",
455
+ " max-width: 0;\n",
456
+ " overflow: hidden;\n",
457
+ " text-align: left;\n",
458
+ " /* unfitted */\n",
459
+ " background-color: var(--sklearn-color-unfitted-level-0);\n",
460
+ "}\n",
461
+ "\n",
462
+ "#sk-container-id-3 div.sk-toggleable__content.fitted {\n",
463
+ " /* fitted */\n",
464
+ " background-color: var(--sklearn-color-fitted-level-0);\n",
465
+ "}\n",
466
+ "\n",
467
+ "#sk-container-id-3 div.sk-toggleable__content pre {\n",
468
+ " margin: 0.2em;\n",
469
+ " border-radius: 0.25em;\n",
470
+ " color: var(--sklearn-color-text);\n",
471
+ " /* unfitted */\n",
472
+ " background-color: var(--sklearn-color-unfitted-level-0);\n",
473
+ "}\n",
474
+ "\n",
475
+ "#sk-container-id-3 div.sk-toggleable__content.fitted pre {\n",
476
+ " /* fitted */\n",
477
+ " background-color: var(--sklearn-color-fitted-level-0);\n",
478
+ "}\n",
479
+ "\n",
480
+ "#sk-container-id-3 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
481
+ " /* Expand drop-down */\n",
482
+ " max-height: 200px;\n",
483
+ " max-width: 100%;\n",
484
+ " overflow: auto;\n",
485
+ "}\n",
486
+ "\n",
487
+ "#sk-container-id-3 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
488
+ " content: \"▾\";\n",
489
+ "}\n",
490
+ "\n",
491
+ "/* Pipeline/ColumnTransformer-specific style */\n",
492
+ "\n",
493
+ "#sk-container-id-3 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
494
+ " color: var(--sklearn-color-text);\n",
495
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
496
+ "}\n",
497
+ "\n",
498
+ "#sk-container-id-3 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
499
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
500
+ "}\n",
501
+ "\n",
502
+ "/* Estimator-specific style */\n",
503
+ "\n",
504
+ "/* Colorize estimator box */\n",
505
+ "#sk-container-id-3 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
506
+ " /* unfitted */\n",
507
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
508
+ "}\n",
509
+ "\n",
510
+ "#sk-container-id-3 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
511
+ " /* fitted */\n",
512
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
513
+ "}\n",
514
+ "\n",
515
+ "#sk-container-id-3 div.sk-label label.sk-toggleable__label,\n",
516
+ "#sk-container-id-3 div.sk-label label {\n",
517
+ " /* The background is the default theme color */\n",
518
+ " color: var(--sklearn-color-text-on-default-background);\n",
519
+ "}\n",
520
+ "\n",
521
+ "/* On hover, darken the color of the background */\n",
522
+ "#sk-container-id-3 div.sk-label:hover label.sk-toggleable__label {\n",
523
+ " color: var(--sklearn-color-text);\n",
524
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
525
+ "}\n",
526
+ "\n",
527
+ "/* Label box, darken color on hover, fitted */\n",
528
+ "#sk-container-id-3 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
529
+ " color: var(--sklearn-color-text);\n",
530
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
531
+ "}\n",
532
+ "\n",
533
+ "/* Estimator label */\n",
534
+ "\n",
535
+ "#sk-container-id-3 div.sk-label label {\n",
536
+ " font-family: monospace;\n",
537
+ " font-weight: bold;\n",
538
+ " display: inline-block;\n",
539
+ " line-height: 1.2em;\n",
540
+ "}\n",
541
+ "\n",
542
+ "#sk-container-id-3 div.sk-label-container {\n",
543
+ " text-align: center;\n",
544
+ "}\n",
545
+ "\n",
546
+ "/* Estimator-specific */\n",
547
+ "#sk-container-id-3 div.sk-estimator {\n",
548
+ " font-family: monospace;\n",
549
+ " border: 1px dotted var(--sklearn-color-border-box);\n",
550
+ " border-radius: 0.25em;\n",
551
+ " box-sizing: border-box;\n",
552
+ " margin-bottom: 0.5em;\n",
553
+ " /* unfitted */\n",
554
+ " background-color: var(--sklearn-color-unfitted-level-0);\n",
555
+ "}\n",
556
+ "\n",
557
+ "#sk-container-id-3 div.sk-estimator.fitted {\n",
558
+ " /* fitted */\n",
559
+ " background-color: var(--sklearn-color-fitted-level-0);\n",
560
+ "}\n",
561
+ "\n",
562
+ "/* on hover */\n",
563
+ "#sk-container-id-3 div.sk-estimator:hover {\n",
564
+ " /* unfitted */\n",
565
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
566
+ "}\n",
567
+ "\n",
568
+ "#sk-container-id-3 div.sk-estimator.fitted:hover {\n",
569
+ " /* fitted */\n",
570
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
571
+ "}\n",
572
+ "\n",
573
+ "/* Specification for estimator info (e.g. \"i\" and \"?\") */\n",
574
+ "\n",
575
+ "/* Common style for \"i\" and \"?\" */\n",
576
+ "\n",
577
+ ".sk-estimator-doc-link,\n",
578
+ "a:link.sk-estimator-doc-link,\n",
579
+ "a:visited.sk-estimator-doc-link {\n",
580
+ " float: right;\n",
581
+ " font-size: smaller;\n",
582
+ " line-height: 1em;\n",
583
+ " font-family: monospace;\n",
584
+ " background-color: var(--sklearn-color-background);\n",
585
+ " border-radius: 1em;\n",
586
+ " height: 1em;\n",
587
+ " width: 1em;\n",
588
+ " text-decoration: none !important;\n",
589
+ " margin-left: 1ex;\n",
590
+ " /* unfitted */\n",
591
+ " border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
592
+ " color: var(--sklearn-color-unfitted-level-1);\n",
593
+ "}\n",
594
+ "\n",
595
+ ".sk-estimator-doc-link.fitted,\n",
596
+ "a:link.sk-estimator-doc-link.fitted,\n",
597
+ "a:visited.sk-estimator-doc-link.fitted {\n",
598
+ " /* fitted */\n",
599
+ " border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
600
+ " color: var(--sklearn-color-fitted-level-1);\n",
601
+ "}\n",
602
+ "\n",
603
+ "/* On hover */\n",
604
+ "div.sk-estimator:hover .sk-estimator-doc-link:hover,\n",
605
+ ".sk-estimator-doc-link:hover,\n",
606
+ "div.sk-label-container:hover .sk-estimator-doc-link:hover,\n",
607
+ ".sk-estimator-doc-link:hover {\n",
608
+ " /* unfitted */\n",
609
+ " background-color: var(--sklearn-color-unfitted-level-3);\n",
610
+ " color: var(--sklearn-color-background);\n",
611
+ " text-decoration: none;\n",
612
+ "}\n",
613
+ "\n",
614
+ "div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n",
615
+ ".sk-estimator-doc-link.fitted:hover,\n",
616
+ "div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n",
617
+ ".sk-estimator-doc-link.fitted:hover {\n",
618
+ " /* fitted */\n",
619
+ " background-color: var(--sklearn-color-fitted-level-3);\n",
620
+ " color: var(--sklearn-color-background);\n",
621
+ " text-decoration: none;\n",
622
+ "}\n",
623
+ "\n",
624
+ "/* Span, style for the box shown on hovering the info icon */\n",
625
+ ".sk-estimator-doc-link span {\n",
626
+ " display: none;\n",
627
+ " z-index: 9999;\n",
628
+ " position: relative;\n",
629
+ " font-weight: normal;\n",
630
+ " right: .2ex;\n",
631
+ " padding: .5ex;\n",
632
+ " margin: .5ex;\n",
633
+ " width: min-content;\n",
634
+ " min-width: 20ex;\n",
635
+ " max-width: 50ex;\n",
636
+ " color: var(--sklearn-color-text);\n",
637
+ " box-shadow: 2pt 2pt 4pt #999;\n",
638
+ " /* unfitted */\n",
639
+ " background: var(--sklearn-color-unfitted-level-0);\n",
640
+ " border: .5pt solid var(--sklearn-color-unfitted-level-3);\n",
641
+ "}\n",
642
+ "\n",
643
+ ".sk-estimator-doc-link.fitted span {\n",
644
+ " /* fitted */\n",
645
+ " background: var(--sklearn-color-fitted-level-0);\n",
646
+ " border: var(--sklearn-color-fitted-level-3);\n",
647
+ "}\n",
648
+ "\n",
649
+ ".sk-estimator-doc-link:hover span {\n",
650
+ " display: block;\n",
651
+ "}\n",
652
+ "\n",
653
+ "/* \"?\"-specific style due to the `<a>` HTML tag */\n",
654
+ "\n",
655
+ "#sk-container-id-3 a.estimator_doc_link {\n",
656
+ " float: right;\n",
657
+ " font-size: 1rem;\n",
658
+ " line-height: 1em;\n",
659
+ " font-family: monospace;\n",
660
+ " background-color: var(--sklearn-color-background);\n",
661
+ " border-radius: 1rem;\n",
662
+ " height: 1rem;\n",
663
+ " width: 1rem;\n",
664
+ " text-decoration: none;\n",
665
+ " /* unfitted */\n",
666
+ " color: var(--sklearn-color-unfitted-level-1);\n",
667
+ " border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
668
+ "}\n",
669
+ "\n",
670
+ "#sk-container-id-3 a.estimator_doc_link.fitted {\n",
671
+ " /* fitted */\n",
672
+ " border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
673
+ " color: var(--sklearn-color-fitted-level-1);\n",
674
+ "}\n",
675
+ "\n",
676
+ "/* On hover */\n",
677
+ "#sk-container-id-3 a.estimator_doc_link:hover {\n",
678
+ " /* unfitted */\n",
679
+ " background-color: var(--sklearn-color-unfitted-level-3);\n",
680
+ " color: var(--sklearn-color-background);\n",
681
+ " text-decoration: none;\n",
682
+ "}\n",
683
+ "\n",
684
+ "#sk-container-id-3 a.estimator_doc_link.fitted:hover {\n",
685
+ " /* fitted */\n",
686
+ " background-color: var(--sklearn-color-fitted-level-3);\n",
687
+ "}\n",
688
+ "</style><div id=\"sk-container-id-3\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>LogisticRegression()</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-3\" type=\"checkbox\" checked><label for=\"sk-estimator-id-3\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;&nbsp;LogisticRegression<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.5/modules/generated/sklearn.linear_model.LogisticRegression.html\">?<span>Documentation for LogisticRegression</span></a><span class=\"sk-estimator-doc-link fitted\">i<span>Fitted</span></span></label><div class=\"sk-toggleable__content fitted\"><pre>LogisticRegression()</pre></div> </div></div></div></div>"
689
+ ],
690
+ "text/plain": [
691
+ "LogisticRegression()"
692
+ ]
693
+ },
694
+ "execution_count": 88,
695
+ "metadata": {},
696
+ "output_type": "execute_result"
697
+ }
698
+ ],
699
+ "source": [
700
+ "from sklearn.linear_model import LogisticRegression # Import LogisticRegression\n",
701
+ "\n",
702
+ "# Train a logistic regression model\n",
703
+ "model = LogisticRegression()\n",
704
+ "model.fit(X_train_tfidf, y_train)"
705
+ ]
706
+ },
707
+ {
708
+ "cell_type": "code",
709
+ "execution_count": 89,
710
+ "metadata": {},
711
+ "outputs": [],
712
+ "source": [
713
+ "# Make predictions on the test set\n",
714
+ "y_pred = model.predict(X_test_tfidf)"
715
+ ]
716
+ },
717
+ {
718
+ "cell_type": "code",
719
+ "execution_count": 90,
720
+ "metadata": {},
721
+ "outputs": [
722
+ {
723
+ "name": "stdout",
724
+ "output_type": "stream",
725
+ "text": [
726
+ "Accuracy: 0.9887640449438202\n",
727
+ "Classification Report:\n",
728
+ " precision recall f1-score support\n",
729
+ "\n",
730
+ " business 0.97 1.00 0.99 103\n",
731
+ "entertainment 1.00 0.98 0.99 84\n",
732
+ " politics 0.98 0.99 0.98 80\n",
733
+ " sport 1.00 0.99 0.99 98\n",
734
+ " tech 1.00 0.99 0.99 80\n",
735
+ "\n",
736
+ " accuracy 0.99 445\n",
737
+ " macro avg 0.99 0.99 0.99 445\n",
738
+ " weighted avg 0.99 0.99 0.99 445\n",
739
+ "\n"
740
+ ]
741
+ }
742
+ ],
743
+ "source": [
744
+ "from sklearn.metrics import accuracy_score, classification_report # Import evaluation metrics\n",
745
+ "\n",
746
+ "# Evaluate the model\n",
747
+ "print(\"Accuracy:\", accuracy_score(y_test, y_pred))\n",
748
+ "print(\"Classification Report:\\n\", classification_report(y_test, y_pred))\n"
749
+ ]
750
+ },
751
+ {
752
+ "cell_type": "code",
753
+ "execution_count": 91,
754
+ "metadata": {},
755
+ "outputs": [
756
+ {
757
+ "name": "stdout",
758
+ "output_type": "stream",
759
+ "text": [
760
+ "confusion_matrix:\n",
761
+ " [[103 0 0 0 0]\n",
762
+ " [ 0 82 2 0 0]\n",
763
+ " [ 1 0 79 0 0]\n",
764
+ " [ 1 0 0 97 0]\n",
765
+ " [ 1 0 0 0 79]]\n"
766
+ ]
767
+ }
768
+ ],
769
+ "source": [
770
+ "from sklearn.metrics import confusion_matrix\n",
771
+ "print(\"confusion_matrix:\\n\", confusion_matrix(y_test, y_pred))\n"
772
+ ]
773
+ },
774
+ {
775
+ "cell_type": "code",
776
+ "execution_count": 92,
777
+ "metadata": {},
778
+ "outputs": [
779
+ {
780
+ "data": {
781
+ "text/plain": [
782
+ "['model.pkl']"
783
+ ]
784
+ },
785
+ "execution_count": 92,
786
+ "metadata": {},
787
+ "output_type": "execute_result"
788
+ }
789
+ ],
790
+ "source": [
791
+ "import joblib\n",
792
+ "joblib.dump(model,'model.pkl')"
793
+ ]
794
+ },
795
+ {
796
+ "cell_type": "code",
797
+ "execution_count": 93,
798
+ "metadata": {},
799
+ "outputs": [
800
+ {
801
+ "data": {
802
+ "text/plain": [
803
+ "['vectorizer.pkl']"
804
+ ]
805
+ },
806
+ "execution_count": 93,
807
+ "metadata": {},
808
+ "output_type": "execute_result"
809
+ }
810
+ ],
811
+ "source": [
812
+ "import joblib\n",
813
+ "joblib.dump(vectorizer,'vectorizer.pkl')"
814
+ ]
815
+ },
816
+ {
817
+ "cell_type": "code",
818
+ "execution_count": null,
819
+ "metadata": {},
820
+ "outputs": [],
821
+ "source": []
822
+ }
823
+ ],
824
+ "metadata": {
825
+ "kernelspec": {
826
+ "display_name": "base",
827
+ "language": "python",
828
+ "name": "python3"
829
+ },
830
+ "language_info": {
831
+ "codemirror_mode": {
832
+ "name": "ipython",
833
+ "version": 3
834
+ },
835
+ "file_extension": ".py",
836
+ "mimetype": "text/x-python",
837
+ "name": "python",
838
+ "nbconvert_exporter": "python",
839
+ "pygments_lexer": "ipython3",
840
+ "version": "3.10.9"
841
+ }
842
+ },
843
+ "nbformat": 4,
844
+ "nbformat_minor": 2
845
+ }
bbc_data.csv ADDED
The diff for this file is too large to render. See raw diff