PranavReddy18 committed on
Commit
cf76166
·
verified ·
1 Parent(s): 2819d26

Delete BBC_NEWS_PREDICTION.ipynb

Browse files
Files changed (1) hide show
  1. BBC_NEWS_PREDICTION.ipynb +0 -845
BBC_NEWS_PREDICTION.ipynb DELETED
@@ -1,845 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": 60,
6
- "metadata": {},
7
- "outputs": [],
8
- "source": [
9
- "import pandas as pd\n",
10
- "import numpy as np"
11
- ]
12
- },
13
- {
14
- "cell_type": "code",
15
- "execution_count": 61,
16
- "metadata": {},
17
- "outputs": [],
18
- "source": [
19
- "df=pd.read_csv(\"C:\\\\Users\\\\saipr\\\\Downloads\\\\bbc_data.csv\\\\bbc_data.csv\")"
20
- ]
21
- },
22
- {
23
- "cell_type": "code",
24
- "execution_count": 62,
25
- "metadata": {},
26
- "outputs": [
27
- {
28
- "data": {
29
- "text/html": [
30
- "<div>\n",
31
- "<style scoped>\n",
32
- " .dataframe tbody tr th:only-of-type {\n",
33
- " vertical-align: middle;\n",
34
- " }\n",
35
- "\n",
36
- " .dataframe tbody tr th {\n",
37
- " vertical-align: top;\n",
38
- " }\n",
39
- "\n",
40
- " .dataframe thead th {\n",
41
- " text-align: right;\n",
42
- " }\n",
43
- "</style>\n",
44
- "<table border=\"1\" class=\"dataframe\">\n",
45
- " <thead>\n",
46
- " <tr style=\"text-align: right;\">\n",
47
- " <th></th>\n",
48
- " <th>data</th>\n",
49
- " <th>labels</th>\n",
50
- " </tr>\n",
51
- " </thead>\n",
52
- " <tbody>\n",
53
- " <tr>\n",
54
- " <th>0</th>\n",
55
- " <td>Musicians to tackle US red tape Musicians gro...</td>\n",
56
- " <td>entertainment</td>\n",
57
- " </tr>\n",
58
- " <tr>\n",
59
- " <th>1</th>\n",
60
- " <td>U2s desire to be number one U2, who have won ...</td>\n",
61
- " <td>entertainment</td>\n",
62
- " </tr>\n",
63
- " <tr>\n",
64
- " <th>2</th>\n",
65
- " <td>Rocker Doherty in on-stage fight Rock singer ...</td>\n",
66
- " <td>entertainment</td>\n",
67
- " </tr>\n",
68
- " <tr>\n",
69
- " <th>3</th>\n",
70
- " <td>Snicket tops US box office chart The film ada...</td>\n",
71
- " <td>entertainment</td>\n",
72
- " </tr>\n",
73
- " <tr>\n",
74
- " <th>4</th>\n",
75
- " <td>Oceans Twelve raids box office Oceans Twelve,...</td>\n",
76
- " <td>entertainment</td>\n",
77
- " </tr>\n",
78
- " <tr>\n",
79
- " <th>...</th>\n",
80
- " <td>...</td>\n",
81
- " <td>...</td>\n",
82
- " </tr>\n",
83
- " <tr>\n",
84
- " <th>2220</th>\n",
85
- " <td>Warning over Windows Word files Writing a Mic...</td>\n",
86
- " <td>tech</td>\n",
87
- " </tr>\n",
88
- " <tr>\n",
89
- " <th>2221</th>\n",
90
- " <td>Fast lifts rise into record books Two high-sp...</td>\n",
91
- " <td>tech</td>\n",
92
- " </tr>\n",
93
- " <tr>\n",
94
- " <th>2222</th>\n",
95
- " <td>Nintendo adds media playing to DS Nintendo is...</td>\n",
96
- " <td>tech</td>\n",
97
- " </tr>\n",
98
- " <tr>\n",
99
- " <th>2223</th>\n",
100
- " <td>Fast moving phone viruses appear Security fir...</td>\n",
101
- " <td>tech</td>\n",
102
- " </tr>\n",
103
- " <tr>\n",
104
- " <th>2224</th>\n",
105
- " <td>Hacker threat to Apples iTunes Users of Apple...</td>\n",
106
- " <td>tech</td>\n",
107
- " </tr>\n",
108
- " </tbody>\n",
109
- "</table>\n",
110
- "<p>2225 rows × 2 columns</p>\n",
111
- "</div>"
112
- ],
113
- "text/plain": [
114
- " data labels\n",
115
- "0 Musicians to tackle US red tape Musicians gro... entertainment\n",
116
- "1 U2s desire to be number one U2, who have won ... entertainment\n",
117
- "2 Rocker Doherty in on-stage fight Rock singer ... entertainment\n",
118
- "3 Snicket tops US box office chart The film ada... entertainment\n",
119
- "4 Oceans Twelve raids box office Oceans Twelve,... entertainment\n",
120
- "... ... ...\n",
121
- "2220 Warning over Windows Word files Writing a Mic... tech\n",
122
- "2221 Fast lifts rise into record books Two high-sp... tech\n",
123
- "2222 Nintendo adds media playing to DS Nintendo is... tech\n",
124
- "2223 Fast moving phone viruses appear Security fir... tech\n",
125
- "2224 Hacker threat to Apples iTunes Users of Apple... tech\n",
126
- "\n",
127
- "[2225 rows x 2 columns]"
128
- ]
129
- },
130
- "execution_count": 62,
131
- "metadata": {},
132
- "output_type": "execute_result"
133
- }
134
- ],
135
- "source": [
136
- "df"
137
- ]
138
- },
139
- {
140
- "cell_type": "code",
141
- "execution_count": 63,
142
- "metadata": {},
143
- "outputs": [
144
- {
145
- "data": {
146
- "text/plain": [
147
- "labels\n",
148
- "sport 511\n",
149
- "business 510\n",
150
- "politics 417\n",
151
- "tech 401\n",
152
- "entertainment 386\n",
153
- "Name: count, dtype: int64"
154
- ]
155
- },
156
- "execution_count": 63,
157
- "metadata": {},
158
- "output_type": "execute_result"
159
- }
160
- ],
161
- "source": [
162
- "df['labels'].value_counts()"
163
- ]
164
- },
165
- {
166
- "cell_type": "code",
167
- "execution_count": 64,
168
- "metadata": {},
169
- "outputs": [
170
- {
171
- "data": {
172
- "text/plain": [
173
- "data 0\n",
174
- "labels 0\n",
175
- "dtype: int64"
176
- ]
177
- },
178
- "execution_count": 64,
179
- "metadata": {},
180
- "output_type": "execute_result"
181
- }
182
- ],
183
- "source": [
184
- "df.isnull().sum()"
185
- ]
186
- },
187
- {
188
- "cell_type": "code",
189
- "execution_count": 65,
190
- "metadata": {},
191
- "outputs": [],
192
- "source": [
193
- "# Split the data into features and target\n",
194
- "X = df['data']\n",
195
- "y = df['labels']\n"
196
- ]
197
- },
198
- {
199
- "cell_type": "code",
200
- "execution_count": 66,
201
- "metadata": {},
202
- "outputs": [
203
- {
204
- "name": "stderr",
205
- "output_type": "stream",
206
- "text": [
207
- "[nltk_data] Downloading package stopwords to\n",
208
- "[nltk_data] C:\\Users\\saipr\\AppData\\Roaming\\nltk_data...\n",
209
- "[nltk_data] Package stopwords is already up-to-date!\n",
210
- "[nltk_data] Downloading package punkt to\n",
211
- "[nltk_data] C:\\Users\\saipr\\AppData\\Roaming\\nltk_data...\n",
212
- "[nltk_data] Package punkt is already up-to-date!\n"
213
- ]
214
- }
215
- ],
216
- "source": [
217
- "import re\n",
218
- "from nltk.tokenize import word_tokenize\n",
219
- "from nltk.corpus import stopwords\n",
220
- "\n",
221
- "# Ensure required NLTK data is available\n",
222
- "import nltk\n",
223
- "nltk.download('stopwords')\n",
224
- "nltk.download('punkt')\n",
225
- "\n",
226
- "# Load stopwords only once\n",
227
- "stop_words = set(stopwords.words('english'))\n"
228
- ]
229
- },
230
- {
231
- "cell_type": "code",
232
- "execution_count": 67,
233
- "metadata": {},
234
- "outputs": [],
235
- "source": [
236
- "\n",
237
- "def preprocess_text(text):\n",
238
- " # Remove punctuation and convert to lower case\n",
239
- " text = re.sub(r'[^\\w\\s]', '', text.lower())\n",
240
- " # Tokenize\n",
241
- " tokens = word_tokenize(text)\n",
242
- " # Remove stopwords\n",
243
- " tokens = [word for word in tokens if word not in stop_words]\n",
244
- " return ' '.join(tokens)\n",
245
- "\n",
246
- "# Apply preprocessing to the dataframe\n",
247
- "df['processed_data'] = df['data'].apply(preprocess_text)\n"
248
- ]
249
- },
250
- {
251
- "cell_type": "code",
252
- "execution_count": 68,
253
- "metadata": {},
254
- "outputs": [],
255
- "source": [
256
- "from sklearn.model_selection import train_test_split # Import the function\n",
257
- "\n",
258
- "# Split the data into training and testing sets\n",
259
- "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n"
260
- ]
261
- },
262
- {
263
- "cell_type": "code",
264
- "execution_count": 69,
265
- "metadata": {},
266
- "outputs": [],
267
- "source": [
268
- "from sklearn.feature_extraction.text import TfidfVectorizer # Import TfidfVectorizer\n",
269
- "\n",
270
- "# Convert text data to numerical data using TF-IDF\n",
271
- "vectorizer = TfidfVectorizer(stop_words='english')\n",
272
- "X_train_tfidf = vectorizer.fit_transform(X_train)\n",
273
- "X_test_tfidf = vectorizer.transform(X_test)\n"
274
- ]
275
- },
276
- {
277
- "cell_type": "code",
278
- "execution_count": 70,
279
- "metadata": {},
280
- "outputs": [
281
- {
282
- "data": {
283
- "text/html": [
284
- "<style>#sk-container-id-2 {\n",
285
- " /* Definition of color scheme common for light and dark mode */\n",
286
- " --sklearn-color-text: black;\n",
287
- " --sklearn-color-line: gray;\n",
288
- " /* Definition of color scheme for unfitted estimators */\n",
289
- " --sklearn-color-unfitted-level-0: #fff5e6;\n",
290
- " --sklearn-color-unfitted-level-1: #f6e4d2;\n",
291
- " --sklearn-color-unfitted-level-2: #ffe0b3;\n",
292
- " --sklearn-color-unfitted-level-3: chocolate;\n",
293
- " /* Definition of color scheme for fitted estimators */\n",
294
- " --sklearn-color-fitted-level-0: #f0f8ff;\n",
295
- " --sklearn-color-fitted-level-1: #d4ebff;\n",
296
- " --sklearn-color-fitted-level-2: #b3dbfd;\n",
297
- " --sklearn-color-fitted-level-3: cornflowerblue;\n",
298
- "\n",
299
- " /* Specific color for light theme */\n",
300
- " --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
301
- " --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n",
302
- " --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
303
- " --sklearn-color-icon: #696969;\n",
304
- "\n",
305
- " @media (prefers-color-scheme: dark) {\n",
306
- " /* Redefinition of color scheme for dark theme */\n",
307
- " --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
308
- " --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n",
309
- " --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
310
- " --sklearn-color-icon: #878787;\n",
311
- " }\n",
312
- "}\n",
313
- "\n",
314
- "#sk-container-id-2 {\n",
315
- " color: var(--sklearn-color-text);\n",
316
- "}\n",
317
- "\n",
318
- "#sk-container-id-2 pre {\n",
319
- " padding: 0;\n",
320
- "}\n",
321
- "\n",
322
- "#sk-container-id-2 input.sk-hidden--visually {\n",
323
- " border: 0;\n",
324
- " clip: rect(1px 1px 1px 1px);\n",
325
- " clip: rect(1px, 1px, 1px, 1px);\n",
326
- " height: 1px;\n",
327
- " margin: -1px;\n",
328
- " overflow: hidden;\n",
329
- " padding: 0;\n",
330
- " position: absolute;\n",
331
- " width: 1px;\n",
332
- "}\n",
333
- "\n",
334
- "#sk-container-id-2 div.sk-dashed-wrapped {\n",
335
- " border: 1px dashed var(--sklearn-color-line);\n",
336
- " margin: 0 0.4em 0.5em 0.4em;\n",
337
- " box-sizing: border-box;\n",
338
- " padding-bottom: 0.4em;\n",
339
- " background-color: var(--sklearn-color-background);\n",
340
- "}\n",
341
- "\n",
342
- "#sk-container-id-2 div.sk-container {\n",
343
- " /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n",
344
- " but bootstrap.min.css set `[hidden] { display: none !important; }`\n",
345
- " so we also need the `!important` here to be able to override the\n",
346
- " default hidden behavior on the sphinx rendered scikit-learn.org.\n",
347
- " See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n",
348
- " display: inline-block !important;\n",
349
- " position: relative;\n",
350
- "}\n",
351
- "\n",
352
- "#sk-container-id-2 div.sk-text-repr-fallback {\n",
353
- " display: none;\n",
354
- "}\n",
355
- "\n",
356
- "div.sk-parallel-item,\n",
357
- "div.sk-serial,\n",
358
- "div.sk-item {\n",
359
- " /* draw centered vertical line to link estimators */\n",
360
- " background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n",
361
- " background-size: 2px 100%;\n",
362
- " background-repeat: no-repeat;\n",
363
- " background-position: center center;\n",
364
- "}\n",
365
- "\n",
366
- "/* Parallel-specific style estimator block */\n",
367
- "\n",
368
- "#sk-container-id-2 div.sk-parallel-item::after {\n",
369
- " content: \"\";\n",
370
- " width: 100%;\n",
371
- " border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n",
372
- " flex-grow: 1;\n",
373
- "}\n",
374
- "\n",
375
- "#sk-container-id-2 div.sk-parallel {\n",
376
- " display: flex;\n",
377
- " align-items: stretch;\n",
378
- " justify-content: center;\n",
379
- " background-color: var(--sklearn-color-background);\n",
380
- " position: relative;\n",
381
- "}\n",
382
- "\n",
383
- "#sk-container-id-2 div.sk-parallel-item {\n",
384
- " display: flex;\n",
385
- " flex-direction: column;\n",
386
- "}\n",
387
- "\n",
388
- "#sk-container-id-2 div.sk-parallel-item:first-child::after {\n",
389
- " align-self: flex-end;\n",
390
- " width: 50%;\n",
391
- "}\n",
392
- "\n",
393
- "#sk-container-id-2 div.sk-parallel-item:last-child::after {\n",
394
- " align-self: flex-start;\n",
395
- " width: 50%;\n",
396
- "}\n",
397
- "\n",
398
- "#sk-container-id-2 div.sk-parallel-item:only-child::after {\n",
399
- " width: 0;\n",
400
- "}\n",
401
- "\n",
402
- "/* Serial-specific style estimator block */\n",
403
- "\n",
404
- "#sk-container-id-2 div.sk-serial {\n",
405
- " display: flex;\n",
406
- " flex-direction: column;\n",
407
- " align-items: center;\n",
408
- " background-color: var(--sklearn-color-background);\n",
409
- " padding-right: 1em;\n",
410
- " padding-left: 1em;\n",
411
- "}\n",
412
- "\n",
413
- "\n",
414
- "/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n",
415
- "clickable and can be expanded/collapsed.\n",
416
- "- Pipeline and ColumnTransformer use this feature and define the default style\n",
417
- "- Estimators will overwrite some part of the style using the `sk-estimator` class\n",
418
- "*/\n",
419
- "\n",
420
- "/* Pipeline and ColumnTransformer style (default) */\n",
421
- "\n",
422
- "#sk-container-id-2 div.sk-toggleable {\n",
423
- " /* Default theme specific background. It is overwritten whether we have a\n",
424
- " specific estimator or a Pipeline/ColumnTransformer */\n",
425
- " background-color: var(--sklearn-color-background);\n",
426
- "}\n",
427
- "\n",
428
- "/* Toggleable label */\n",
429
- "#sk-container-id-2 label.sk-toggleable__label {\n",
430
- " cursor: pointer;\n",
431
- " display: block;\n",
432
- " width: 100%;\n",
433
- " margin-bottom: 0;\n",
434
- " padding: 0.5em;\n",
435
- " box-sizing: border-box;\n",
436
- " text-align: center;\n",
437
- "}\n",
438
- "\n",
439
- "#sk-container-id-2 label.sk-toggleable__label-arrow:before {\n",
440
- " /* Arrow on the left of the label */\n",
441
- " content: \"▸\";\n",
442
- " float: left;\n",
443
- " margin-right: 0.25em;\n",
444
- " color: var(--sklearn-color-icon);\n",
445
- "}\n",
446
- "\n",
447
- "#sk-container-id-2 label.sk-toggleable__label-arrow:hover:before {\n",
448
- " color: var(--sklearn-color-text);\n",
449
- "}\n",
450
- "\n",
451
- "/* Toggleable content - dropdown */\n",
452
- "\n",
453
- "#sk-container-id-2 div.sk-toggleable__content {\n",
454
- " max-height: 0;\n",
455
- " max-width: 0;\n",
456
- " overflow: hidden;\n",
457
- " text-align: left;\n",
458
- " /* unfitted */\n",
459
- " background-color: var(--sklearn-color-unfitted-level-0);\n",
460
- "}\n",
461
- "\n",
462
- "#sk-container-id-2 div.sk-toggleable__content.fitted {\n",
463
- " /* fitted */\n",
464
- " background-color: var(--sklearn-color-fitted-level-0);\n",
465
- "}\n",
466
- "\n",
467
- "#sk-container-id-2 div.sk-toggleable__content pre {\n",
468
- " margin: 0.2em;\n",
469
- " border-radius: 0.25em;\n",
470
- " color: var(--sklearn-color-text);\n",
471
- " /* unfitted */\n",
472
- " background-color: var(--sklearn-color-unfitted-level-0);\n",
473
- "}\n",
474
- "\n",
475
- "#sk-container-id-2 div.sk-toggleable__content.fitted pre {\n",
476
- " /* fitted */\n",
477
- " background-color: var(--sklearn-color-fitted-level-0);\n",
478
- "}\n",
479
- "\n",
480
- "#sk-container-id-2 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
481
- " /* Expand drop-down */\n",
482
- " max-height: 200px;\n",
483
- " max-width: 100%;\n",
484
- " overflow: auto;\n",
485
- "}\n",
486
- "\n",
487
- "#sk-container-id-2 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
488
- " content: \"▾\";\n",
489
- "}\n",
490
- "\n",
491
- "/* Pipeline/ColumnTransformer-specific style */\n",
492
- "\n",
493
- "#sk-container-id-2 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
494
- " color: var(--sklearn-color-text);\n",
495
- " background-color: var(--sklearn-color-unfitted-level-2);\n",
496
- "}\n",
497
- "\n",
498
- "#sk-container-id-2 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
499
- " background-color: var(--sklearn-color-fitted-level-2);\n",
500
- "}\n",
501
- "\n",
502
- "/* Estimator-specific style */\n",
503
- "\n",
504
- "/* Colorize estimator box */\n",
505
- "#sk-container-id-2 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
506
- " /* unfitted */\n",
507
- " background-color: var(--sklearn-color-unfitted-level-2);\n",
508
- "}\n",
509
- "\n",
510
- "#sk-container-id-2 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
511
- " /* fitted */\n",
512
- " background-color: var(--sklearn-color-fitted-level-2);\n",
513
- "}\n",
514
- "\n",
515
- "#sk-container-id-2 div.sk-label label.sk-toggleable__label,\n",
516
- "#sk-container-id-2 div.sk-label label {\n",
517
- " /* The background is the default theme color */\n",
518
- " color: var(--sklearn-color-text-on-default-background);\n",
519
- "}\n",
520
- "\n",
521
- "/* On hover, darken the color of the background */\n",
522
- "#sk-container-id-2 div.sk-label:hover label.sk-toggleable__label {\n",
523
- " color: var(--sklearn-color-text);\n",
524
- " background-color: var(--sklearn-color-unfitted-level-2);\n",
525
- "}\n",
526
- "\n",
527
- "/* Label box, darken color on hover, fitted */\n",
528
- "#sk-container-id-2 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
529
- " color: var(--sklearn-color-text);\n",
530
- " background-color: var(--sklearn-color-fitted-level-2);\n",
531
- "}\n",
532
- "\n",
533
- "/* Estimator label */\n",
534
- "\n",
535
- "#sk-container-id-2 div.sk-label label {\n",
536
- " font-family: monospace;\n",
537
- " font-weight: bold;\n",
538
- " display: inline-block;\n",
539
- " line-height: 1.2em;\n",
540
- "}\n",
541
- "\n",
542
- "#sk-container-id-2 div.sk-label-container {\n",
543
- " text-align: center;\n",
544
- "}\n",
545
- "\n",
546
- "/* Estimator-specific */\n",
547
- "#sk-container-id-2 div.sk-estimator {\n",
548
- " font-family: monospace;\n",
549
- " border: 1px dotted var(--sklearn-color-border-box);\n",
550
- " border-radius: 0.25em;\n",
551
- " box-sizing: border-box;\n",
552
- " margin-bottom: 0.5em;\n",
553
- " /* unfitted */\n",
554
- " background-color: var(--sklearn-color-unfitted-level-0);\n",
555
- "}\n",
556
- "\n",
557
- "#sk-container-id-2 div.sk-estimator.fitted {\n",
558
- " /* fitted */\n",
559
- " background-color: var(--sklearn-color-fitted-level-0);\n",
560
- "}\n",
561
- "\n",
562
- "/* on hover */\n",
563
- "#sk-container-id-2 div.sk-estimator:hover {\n",
564
- " /* unfitted */\n",
565
- " background-color: var(--sklearn-color-unfitted-level-2);\n",
566
- "}\n",
567
- "\n",
568
- "#sk-container-id-2 div.sk-estimator.fitted:hover {\n",
569
- " /* fitted */\n",
570
- " background-color: var(--sklearn-color-fitted-level-2);\n",
571
- "}\n",
572
- "\n",
573
- "/* Specification for estimator info (e.g. \"i\" and \"?\") */\n",
574
- "\n",
575
- "/* Common style for \"i\" and \"?\" */\n",
576
- "\n",
577
- ".sk-estimator-doc-link,\n",
578
- "a:link.sk-estimator-doc-link,\n",
579
- "a:visited.sk-estimator-doc-link {\n",
580
- " float: right;\n",
581
- " font-size: smaller;\n",
582
- " line-height: 1em;\n",
583
- " font-family: monospace;\n",
584
- " background-color: var(--sklearn-color-background);\n",
585
- " border-radius: 1em;\n",
586
- " height: 1em;\n",
587
- " width: 1em;\n",
588
- " text-decoration: none !important;\n",
589
- " margin-left: 1ex;\n",
590
- " /* unfitted */\n",
591
- " border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
592
- " color: var(--sklearn-color-unfitted-level-1);\n",
593
- "}\n",
594
- "\n",
595
- ".sk-estimator-doc-link.fitted,\n",
596
- "a:link.sk-estimator-doc-link.fitted,\n",
597
- "a:visited.sk-estimator-doc-link.fitted {\n",
598
- " /* fitted */\n",
599
- " border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
600
- " color: var(--sklearn-color-fitted-level-1);\n",
601
- "}\n",
602
- "\n",
603
- "/* On hover */\n",
604
- "div.sk-estimator:hover .sk-estimator-doc-link:hover,\n",
605
- ".sk-estimator-doc-link:hover,\n",
606
- "div.sk-label-container:hover .sk-estimator-doc-link:hover,\n",
607
- ".sk-estimator-doc-link:hover {\n",
608
- " /* unfitted */\n",
609
- " background-color: var(--sklearn-color-unfitted-level-3);\n",
610
- " color: var(--sklearn-color-background);\n",
611
- " text-decoration: none;\n",
612
- "}\n",
613
- "\n",
614
- "div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n",
615
- ".sk-estimator-doc-link.fitted:hover,\n",
616
- "div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n",
617
- ".sk-estimator-doc-link.fitted:hover {\n",
618
- " /* fitted */\n",
619
- " background-color: var(--sklearn-color-fitted-level-3);\n",
620
- " color: var(--sklearn-color-background);\n",
621
- " text-decoration: none;\n",
622
- "}\n",
623
- "\n",
624
- "/* Span, style for the box shown on hovering the info icon */\n",
625
- ".sk-estimator-doc-link span {\n",
626
- " display: none;\n",
627
- " z-index: 9999;\n",
628
- " position: relative;\n",
629
- " font-weight: normal;\n",
630
- " right: .2ex;\n",
631
- " padding: .5ex;\n",
632
- " margin: .5ex;\n",
633
- " width: min-content;\n",
634
- " min-width: 20ex;\n",
635
- " max-width: 50ex;\n",
636
- " color: var(--sklearn-color-text);\n",
637
- " box-shadow: 2pt 2pt 4pt #999;\n",
638
- " /* unfitted */\n",
639
- " background: var(--sklearn-color-unfitted-level-0);\n",
640
- " border: .5pt solid var(--sklearn-color-unfitted-level-3);\n",
641
- "}\n",
642
- "\n",
643
- ".sk-estimator-doc-link.fitted span {\n",
644
- " /* fitted */\n",
645
- " background: var(--sklearn-color-fitted-level-0);\n",
646
- " border: var(--sklearn-color-fitted-level-3);\n",
647
- "}\n",
648
- "\n",
649
- ".sk-estimator-doc-link:hover span {\n",
650
- " display: block;\n",
651
- "}\n",
652
- "\n",
653
- "/* \"?\"-specific style due to the `<a>` HTML tag */\n",
654
- "\n",
655
- "#sk-container-id-2 a.estimator_doc_link {\n",
656
- " float: right;\n",
657
- " font-size: 1rem;\n",
658
- " line-height: 1em;\n",
659
- " font-family: monospace;\n",
660
- " background-color: var(--sklearn-color-background);\n",
661
- " border-radius: 1rem;\n",
662
- " height: 1rem;\n",
663
- " width: 1rem;\n",
664
- " text-decoration: none;\n",
665
- " /* unfitted */\n",
666
- " color: var(--sklearn-color-unfitted-level-1);\n",
667
- " border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
668
- "}\n",
669
- "\n",
670
- "#sk-container-id-2 a.estimator_doc_link.fitted {\n",
671
- " /* fitted */\n",
672
- " border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
673
- " color: var(--sklearn-color-fitted-level-1);\n",
674
- "}\n",
675
- "\n",
676
- "/* On hover */\n",
677
- "#sk-container-id-2 a.estimator_doc_link:hover {\n",
678
- " /* unfitted */\n",
679
- " background-color: var(--sklearn-color-unfitted-level-3);\n",
680
- " color: var(--sklearn-color-background);\n",
681
- " text-decoration: none;\n",
682
- "}\n",
683
- "\n",
684
- "#sk-container-id-2 a.estimator_doc_link.fitted:hover {\n",
685
- " /* fitted */\n",
686
- " background-color: var(--sklearn-color-fitted-level-3);\n",
687
- "}\n",
688
- "</style><div id=\"sk-container-id-2\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>LogisticRegression()</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-2\" type=\"checkbox\" checked><label for=\"sk-estimator-id-2\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;&nbsp;LogisticRegression<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.5/modules/generated/sklearn.linear_model.LogisticRegression.html\">?<span>Documentation for LogisticRegression</span></a><span class=\"sk-estimator-doc-link fitted\">i<span>Fitted</span></span></label><div class=\"sk-toggleable__content fitted\"><pre>LogisticRegression()</pre></div> </div></div></div></div>"
689
- ],
690
- "text/plain": [
691
- "LogisticRegression()"
692
- ]
693
- },
694
- "execution_count": 70,
695
- "metadata": {},
696
- "output_type": "execute_result"
697
- }
698
- ],
699
- "source": [
700
- "from sklearn.linear_model import LogisticRegression # Import LogisticRegression\n",
701
- "\n",
702
- "# Train a logistic regression model\n",
703
- "model = LogisticRegression()\n",
704
- "model.fit(X_train_tfidf, y_train)"
705
- ]
706
- },
707
- {
708
- "cell_type": "code",
709
- "execution_count": 71,
710
- "metadata": {},
711
- "outputs": [],
712
- "source": [
713
- "# Make predictions on the test set\n",
714
- "y_pred = model.predict(X_test_tfidf)"
715
- ]
716
- },
717
- {
718
- "cell_type": "code",
719
- "execution_count": 72,
720
- "metadata": {},
721
- "outputs": [
722
- {
723
- "name": "stdout",
724
- "output_type": "stream",
725
- "text": [
726
- "Accuracy: 0.9887640449438202\n",
727
- "Classification Report:\n",
728
- " precision recall f1-score support\n",
729
- "\n",
730
- " business 0.97 1.00 0.99 103\n",
731
- "entertainment 1.00 0.98 0.99 84\n",
732
- " politics 0.98 0.99 0.98 80\n",
733
- " sport 1.00 0.99 0.99 98\n",
734
- " tech 1.00 0.99 0.99 80\n",
735
- "\n",
736
- " accuracy 0.99 445\n",
737
- " macro avg 0.99 0.99 0.99 445\n",
738
- " weighted avg 0.99 0.99 0.99 445\n",
739
- "\n"
740
- ]
741
- }
742
- ],
743
- "source": [
744
- "from sklearn.metrics import accuracy_score, classification_report # Import evaluation metrics\n",
745
- "\n",
746
- "# Evaluate the model\n",
747
- "print(\"Accuracy:\", accuracy_score(y_test, y_pred))\n",
748
- "print(\"Classification Report:\\n\", classification_report(y_test, y_pred))\n"
749
- ]
750
- },
751
- {
752
- "cell_type": "code",
753
- "execution_count": 73,
754
- "metadata": {},
755
- "outputs": [
756
- {
757
- "name": "stdout",
758
- "output_type": "stream",
759
- "text": [
760
- "confusion_matrix:\n",
761
- " [[103 0 0 0 0]\n",
762
- " [ 0 82 2 0 0]\n",
763
- " [ 1 0 79 0 0]\n",
764
- " [ 1 0 0 97 0]\n",
765
- " [ 1 0 0 0 79]]\n"
766
- ]
767
- }
768
- ],
769
- "source": [
770
- "from sklearn.metrics import confusion_matrix\n",
771
- "print(\"confusion_matrix:\\n\", confusion_matrix(y_test, y_pred))\n"
772
- ]
773
- },
774
- {
775
- "cell_type": "code",
776
- "execution_count": 74,
777
- "metadata": {},
778
- "outputs": [
779
- {
780
- "data": {
781
- "text/plain": [
782
- "['model.pkl']"
783
- ]
784
- },
785
- "execution_count": 74,
786
- "metadata": {},
787
- "output_type": "execute_result"
788
- }
789
- ],
790
- "source": [
791
- "import joblib\n",
792
- "joblib.dump(model,'model.pkl')"
793
- ]
794
- },
795
- {
796
- "cell_type": "code",
797
- "execution_count": 75,
798
- "metadata": {},
799
- "outputs": [
800
- {
801
- "data": {
802
- "text/plain": [
803
- "['vectorizer.pkl']"
804
- ]
805
- },
806
- "execution_count": 75,
807
- "metadata": {},
808
- "output_type": "execute_result"
809
- }
810
- ],
811
- "source": [
812
- "import joblib\n",
813
- "joblib.dump(vectorizer,'vectorizer.pkl')"
814
- ]
815
- },
816
- {
817
- "cell_type": "code",
818
- "execution_count": null,
819
- "metadata": {},
820
- "outputs": [],
821
- "source": []
822
- }
823
- ],
824
- "metadata": {
825
- "kernelspec": {
826
- "display_name": "base",
827
- "language": "python",
828
- "name": "python3"
829
- },
830
- "language_info": {
831
- "codemirror_mode": {
832
- "name": "ipython",
833
- "version": 3
834
- },
835
- "file_extension": ".py",
836
- "mimetype": "text/x-python",
837
- "name": "python",
838
- "nbconvert_exporter": "python",
839
- "pygments_lexer": "ipython3",
840
- "version": "3.10.9"
841
- }
842
- },
843
- "nbformat": 4,
844
- "nbformat_minor": 2
845
- }