shujath000 commited on
Commit
a6b51e7
·
verified ·
1 Parent(s): 274a597

Upload 5 files

Browse files
Complete_EDA_and_ML_training_code.ipynb ADDED
@@ -0,0 +1,1532 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "704c8595-1f9b-4511-85d1-398a489721c6",
6
+ "metadata": {},
7
+ "source": [
8
+ "### Data Collection"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": 100,
14
+ "id": "adac7be9-df90-4e81-a49a-28056ef18737",
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "import pandas as pd"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": 101,
24
+ "id": "ce25a764-8b1c-41c5-87db-4a68d8aad590",
25
+ "metadata": {},
26
+ "outputs": [],
27
+ "source": [
28
+ "df1=pd.read_csv(r\"C:\\Users\\sss\\Desktop\\datas_titles.csv\",usecols=[\"id\",\"title\"])"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": 102,
34
+ "id": "356af908-968c-45d6-86e7-5fd70154e80a",
35
+ "metadata": {},
36
+ "outputs": [],
37
+ "source": [
38
+ "df2=pd.read_csv(r\"C:\\Users\\sss\\Desktop\\datas_tags.csv\",usecols=[\"id\",\"tags\"])"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": 103,
44
+ "id": "196ecc83-1274-4769-ad5d-27997a5177bd",
45
+ "metadata": {},
46
+ "outputs": [],
47
+ "source": [
48
+ "df3=pd.read_csv(r\"C:\\Users\\sss\\Documents\\datas_urls_questions.csv\",usecols=[\"id\",\"questions_url\"])"
49
+ ]
50
+ },
51
+ {
52
+ "cell_type": "code",
53
+ "execution_count": 104,
54
+ "id": "f0b50f03-3c5b-45d2-b7d4-bc16dd1931cb",
55
+ "metadata": {},
56
+ "outputs": [],
57
+ "source": [
58
+ "df_merged=pd.merge(df1,df2,on=\"id\")"
59
+ ]
60
+ },
61
+ {
62
+ "cell_type": "code",
63
+ "execution_count": 105,
64
+ "id": "45b40742-759a-4a5f-9c45-748a79ac67e3",
65
+ "metadata": {},
66
+ "outputs": [],
67
+ "source": [
68
+ "final_merged=pd.merge(df_merged,df3,on=\"id\")"
69
+ ]
70
+ },
71
+ {
72
+ "cell_type": "code",
73
+ "execution_count": 106,
74
+ "id": "0b7b6646-04a2-439b-84f1-1ac34fa7c66b",
75
+ "metadata": {},
76
+ "outputs": [],
77
+ "source": [
78
+ "final_merged.to_csv(r\"C:\\Users\\sss\\Documents\\final_dataset.csv\")"
79
+ ]
80
+ },
81
+ {
82
+ "cell_type": "markdown",
83
+ "id": "53974e55-90bf-4aee-a54e-784481332ba1",
84
+ "metadata": {},
85
+ "source": [
86
+ "### Data Cleaning"
87
+ ]
88
+ },
89
+ {
90
+ "cell_type": "code",
91
+ "execution_count": 107,
92
+ "id": "bf6765e0-bea0-4491-a8c7-b29c3c00f0e2",
93
+ "metadata": {},
94
+ "outputs": [],
95
+ "source": [
96
+ "df=final_merged"
97
+ ]
98
+ },
99
+ {
100
+ "cell_type": "code",
101
+ "execution_count": 447,
102
+ "id": "ab026a7d-8628-4f05-8217-92033609e7e3",
103
+ "metadata": {},
104
+ "outputs": [],
105
+ "source": [
106
+ "import nltk\n",
107
+ "import numpy as np\n",
108
+ "from nltk.tokenize import sent_tokenize as s, word_tokenize as w\n",
109
+ "from nltk.corpus import stopwords as stp\n",
110
+ "import string as st\n",
111
+ "import autocorrect\n",
112
+ "from autocorrect import Speller as spp\n",
113
+ "from nltk.stem import WordNetLemmatizer as wl\n",
114
+ "from nltk.corpus import wordnet\n",
115
+ "from nltk import pos_tag\n",
116
+ "from sklearn.pipeline import Pipeline\n",
117
+ "from sklearn.preprocessing import FunctionTransformer,MultiLabelBinarizer\n",
118
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
119
+ "import ast\n",
120
+ "from sklearn.multioutput import MultiOutputClassifier\n",
121
+ "from sklearn.linear_model import LogisticRegression\n",
122
+ "from sklearn.ensemble import BaggingClassifier\n",
123
+ "from sklearn.metrics import accuracy_score,hamming_loss\n",
124
+ "import joblib"
125
+ ]
126
+ },
127
+ {
128
+ "cell_type": "code",
129
+ "execution_count": 109,
130
+ "id": "b582c693-c536-4560-87c3-b1a89b7e842e",
131
+ "metadata": {},
132
+ "outputs": [],
133
+ "source": [
134
+ "df[\"tags\"]=df[\"tags\"].apply(ast.literal_eval)"
135
+ ]
136
+ },
137
+ {
138
+ "cell_type": "code",
139
+ "execution_count": 110,
140
+ "id": "4860d2f8-fa39-4b19-b1e6-b4010665851a",
141
+ "metadata": {},
142
+ "outputs": [
143
+ {
144
+ "name": "stderr",
145
+ "output_type": "stream",
146
+ "text": [
147
+ "[nltk_data] Downloading package wordnet to\n",
148
+ "[nltk_data] C:\\Users\\sss\\AppData\\Roaming\\nltk_data...\n",
149
+ "[nltk_data] Package wordnet is already up-to-date!\n"
150
+ ]
151
+ },
152
+ {
153
+ "data": {
154
+ "text/plain": [
155
+ "True"
156
+ ]
157
+ },
158
+ "execution_count": 110,
159
+ "metadata": {},
160
+ "output_type": "execute_result"
161
+ }
162
+ ],
163
+ "source": [
164
+ "nltk.download(\"wordnet\")"
165
+ ]
166
+ },
167
+ {
168
+ "cell_type": "code",
169
+ "execution_count": 111,
170
+ "id": "31960d58-4eba-4a4a-a22c-8537123161bc",
171
+ "metadata": {},
172
+ "outputs": [
173
+ {
174
+ "name": "stderr",
175
+ "output_type": "stream",
176
+ "text": [
177
+ "[nltk_data] Downloading package punkt to\n",
178
+ "[nltk_data] C:\\Users\\sss\\AppData\\Roaming\\nltk_data...\n",
179
+ "[nltk_data] Package punkt is already up-to-date!\n",
180
+ "[nltk_data] Downloading package stopwords to\n",
181
+ "[nltk_data] C:\\Users\\sss\\AppData\\Roaming\\nltk_data...\n",
182
+ "[nltk_data] Package stopwords is already up-to-date!\n",
183
+ "[nltk_data] Downloading package wordnet to\n",
184
+ "[nltk_data] C:\\Users\\sss\\AppData\\Roaming\\nltk_data...\n",
185
+ "[nltk_data] Package wordnet is already up-to-date!\n",
186
+ "[nltk_data] Downloading package averaged_perceptron_tagger to\n",
187
+ "[nltk_data] C:\\Users\\sss\\AppData\\Roaming\\nltk_data...\n",
188
+ "[nltk_data] Package averaged_perceptron_tagger is already up-to-\n",
189
+ "[nltk_data] date!\n"
190
+ ]
191
+ },
192
+ {
193
+ "data": {
194
+ "text/plain": [
195
+ "True"
196
+ ]
197
+ },
198
+ "execution_count": 111,
199
+ "metadata": {},
200
+ "output_type": "execute_result"
201
+ }
202
+ ],
203
+ "source": [
204
+ "nltk.download('punkt')\n",
205
+ "nltk.download('stopwords')\n",
206
+ "nltk.download(\"wordnet\")\n",
207
+ "nltk.download('averaged_perceptron_tagger')"
208
+ ]
209
+ },
210
+ {
211
+ "cell_type": "code",
212
+ "execution_count": 112,
213
+ "id": "df4a80b5-d935-4653-a363-48a8f2b1fd9e",
214
+ "metadata": {},
215
+ "outputs": [],
216
+ "source": [
217
def sahi_karneka_function(x):
    """Reduce a title to a space-separated string of lemmatised nouns and verbs.

    Steps, in order:
      1. Split ``x`` into sentences, lower-case each one and word-tokenise it.
      2. Drop punctuation tokens and English stop words.
      3. POS-tag the remaining tokens and keep those tagged ``NN*`` or ``V*``.
      4. POS-tag the kept words again and lemmatise each one as a noun when
         its tag starts with ``NN``, otherwise as a verb.

    Parameters
    ----------
    x : str
        Raw text (a question title in this notebook).

    Returns
    -------
    str
        The lemmatised words joined by single spaces; ``""`` when nothing
        survives the filters.
    """
    lemmatizer = wl()
    # Load the stop-word list once per call. The previous loop re-read it for
    # every token and restarted the scan from index 0 after each removal,
    # which made the filtering quadratic for no change in the result.
    stop_words = frozenset(stp.words("english"))

    tokens = [
        token
        for sentence in s(x)
        for token in w(sentence.lower())
        # NOTE: ``in st.punctuation`` is a substring test on the punctuation
        # string, so multi-character tokens such as "()" are dropped as well.
        # Kept exactly as before so the cleaned text does not change.
        if token not in st.punctuation and token not in stop_words
    ]

    # First tagging pass: keep only nouns (NN*) and verbs (V*).
    content_words = [
        word for word, tag in pos_tag(tokens) if tag.startswith(("NN", "V"))
    ]

    # Second tagging pass on the reduced word list, as in the original code.
    # Tags may differ from the first pass because the context is shorter;
    # anything not tagged NN* here is lemmatised as a verb.
    lemmas = [
        lemmatizer.lemmatize(word, pos="n" if tag.startswith("NN") else "v")
        for word, tag in pos_tag(content_words)
    ]
    return " ".join(lemmas)
246
+ ]
247
+ },
248
+ {
249
+ "cell_type": "code",
250
+ "execution_count": 113,
251
+ "id": "96d278fa-3eb2-4751-909b-ed5a04fdd484",
252
+ "metadata": {},
253
+ "outputs": [],
254
+ "source": [
255
+ "df[\"title\"]=df[\"title\"].apply(sahi_karneka_function)"
256
+ ]
257
+ },
258
+ {
259
+ "cell_type": "code",
260
+ "execution_count": 114,
261
+ "id": "c493c122-f90f-47d1-98d2-508c3b061647",
262
+ "metadata": {},
263
+ "outputs": [],
264
+ "source": [
265
def tag_mein_repeat_nikalna(x):
    """Remove repeated entries from a tag list, in place, and return it.

    For every value that appears more than once, only its last occurrence
    is kept; the relative order of the surviving entries is preserved.

    The earlier implementation called ``row.remove(row[i])`` in a loop and
    re-read ``row[i]`` after each removal, so a value occurring three or
    more times could cause an unrelated tag to be deleted (for example
    ``["a", "b", "a", "a"]`` became ``["a"]``). This version never drops a
    value that is not a duplicate.

    Parameters
    ----------
    x : list
        Tag list to clean. Entries must be hashable (tags are strings in
        this notebook).

    Returns
    -------
    list
        The same list object ``x``, now without duplicates.
    """
    # dict keys keep insertion order, so walking the list backwards records
    # each value at the position of its LAST occurrence.
    unique_from_end = list(dict.fromkeys(reversed(x)))
    unique_from_end.reverse()
    # Slice-assign so callers holding a reference to ``x`` see the result,
    # matching the in-place behaviour of the original function.
    x[:] = unique_from_end
    return x
282
+ ]
283
+ },
284
+ {
285
+ "cell_type": "code",
286
+ "execution_count": 115,
287
+ "id": "e9e2aea9-4e0b-4c9a-a6f2-856f3b66221e",
288
+ "metadata": {},
289
+ "outputs": [],
290
+ "source": [
291
+ "df[\"tags\"]=df[\"tags\"].apply( tag_mein_repeat_nikalna)"
292
+ ]
293
+ },
294
+ {
295
+ "cell_type": "code",
296
+ "execution_count": 164,
297
+ "id": "7f870471-0ca1-457d-9363-622c20d7363b",
298
+ "metadata": {},
299
+ "outputs": [
300
+ {
301
+ "data": {
302
+ "text/plain": [
303
+ "'https://stackoverflow.com//questions/79689975/how-to-calculate-hierarchical-aggregates-of-a-dataframe-r'"
304
+ ]
305
+ },
306
+ "execution_count": 164,
307
+ "metadata": {},
308
+ "output_type": "execute_result"
309
+ }
310
+ ],
311
+ "source": [
312
+ "df[\"questions_url\"][0]"
313
+ ]
314
+ },
315
+ {
316
+ "cell_type": "markdown",
317
+ "id": "3d974fc3-3a35-4293-946d-6cdd95e61b54",
318
+ "metadata": {},
319
+ "source": [
320
+ "### Model Selection and Evaluation"
321
+ ]
322
+ },
323
+ {
324
+ "cell_type": "code",
325
+ "execution_count": 371,
326
+ "id": "d5eb6a6e-b39c-4a44-8166-871b75fdd5a8",
327
+ "metadata": {},
328
+ "outputs": [],
329
+ "source": [
330
+ "x=df[\"title\"]\n",
331
+ "y1=df[\"tags\"]"
332
+ ]
333
+ },
334
+ {
335
+ "cell_type": "code",
336
+ "execution_count": 372,
337
+ "id": "ecef232d-80cb-432a-9093-bbba672576da",
338
+ "metadata": {},
339
+ "outputs": [],
340
+ "source": [
341
+ "ml=MultiLabelBinarizer()\n",
342
+ "tfidf=TfidfVectorizer()\n",
343
+ "x_train_tfidf=tfidf.fit_transform(x)\n",
344
+ "x_test_tfidf=tfidf.transform(x[0:1])\n",
345
+ "y_train_encoded=ml.fit_transform(y1)\n",
346
+ "y_encoded_test=ml.transform(y1[0:1])"
347
+ ]
348
+ },
349
+ {
350
+ "cell_type": "code",
351
+ "execution_count": 373,
352
+ "id": "5e41ec78-9cec-4b9f-a490-c936f7c3d015",
353
+ "metadata": {},
354
+ "outputs": [],
355
+ "source": [
356
+ "log=LogisticRegression(max_iter=1000,class_weight='balanced')"
357
+ ]
358
+ },
359
+ {
360
+ "cell_type": "code",
361
+ "execution_count": 374,
362
+ "id": "3c97db28-a318-46bb-a8c6-e2a2843e4d94",
363
+ "metadata": {},
364
+ "outputs": [],
365
+ "source": [
366
+ "model=MultiOutputClassifier(log)"
367
+ ]
368
+ },
369
+ {
370
+ "cell_type": "code",
371
+ "execution_count": 375,
372
+ "id": "df6714e6-b38c-4eac-b068-e0bd50061fd5",
373
+ "metadata": {},
374
+ "outputs": [
375
+ {
376
+ "data": {
377
+ "text/html": [
378
+ "<style>#sk-container-id-9 {\n",
379
+ " /* Definition of color scheme common for light and dark mode */\n",
380
+ " --sklearn-color-text: #000;\n",
381
+ " --sklearn-color-text-muted: #666;\n",
382
+ " --sklearn-color-line: gray;\n",
383
+ " /* Definition of color scheme for unfitted estimators */\n",
384
+ " --sklearn-color-unfitted-level-0: #fff5e6;\n",
385
+ " --sklearn-color-unfitted-level-1: #f6e4d2;\n",
386
+ " --sklearn-color-unfitted-level-2: #ffe0b3;\n",
387
+ " --sklearn-color-unfitted-level-3: chocolate;\n",
388
+ " /* Definition of color scheme for fitted estimators */\n",
389
+ " --sklearn-color-fitted-level-0: #f0f8ff;\n",
390
+ " --sklearn-color-fitted-level-1: #d4ebff;\n",
391
+ " --sklearn-color-fitted-level-2: #b3dbfd;\n",
392
+ " --sklearn-color-fitted-level-3: cornflowerblue;\n",
393
+ "\n",
394
+ " /* Specific color for light theme */\n",
395
+ " --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
396
+ " --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n",
397
+ " --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
398
+ " --sklearn-color-icon: #696969;\n",
399
+ "\n",
400
+ " @media (prefers-color-scheme: dark) {\n",
401
+ " /* Redefinition of color scheme for dark theme */\n",
402
+ " --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
403
+ " --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n",
404
+ " --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
405
+ " --sklearn-color-icon: #878787;\n",
406
+ " }\n",
407
+ "}\n",
408
+ "\n",
409
+ "#sk-container-id-9 {\n",
410
+ " color: var(--sklearn-color-text);\n",
411
+ "}\n",
412
+ "\n",
413
+ "#sk-container-id-9 pre {\n",
414
+ " padding: 0;\n",
415
+ "}\n",
416
+ "\n",
417
+ "#sk-container-id-9 input.sk-hidden--visually {\n",
418
+ " border: 0;\n",
419
+ " clip: rect(1px 1px 1px 1px);\n",
420
+ " clip: rect(1px, 1px, 1px, 1px);\n",
421
+ " height: 1px;\n",
422
+ " margin: -1px;\n",
423
+ " overflow: hidden;\n",
424
+ " padding: 0;\n",
425
+ " position: absolute;\n",
426
+ " width: 1px;\n",
427
+ "}\n",
428
+ "\n",
429
+ "#sk-container-id-9 div.sk-dashed-wrapped {\n",
430
+ " border: 1px dashed var(--sklearn-color-line);\n",
431
+ " margin: 0 0.4em 0.5em 0.4em;\n",
432
+ " box-sizing: border-box;\n",
433
+ " padding-bottom: 0.4em;\n",
434
+ " background-color: var(--sklearn-color-background);\n",
435
+ "}\n",
436
+ "\n",
437
+ "#sk-container-id-9 div.sk-container {\n",
438
+ " /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n",
439
+ " but bootstrap.min.css set `[hidden] { display: none !important; }`\n",
440
+ " so we also need the `!important` here to be able to override the\n",
441
+ " default hidden behavior on the sphinx rendered scikit-learn.org.\n",
442
+ " See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n",
443
+ " display: inline-block !important;\n",
444
+ " position: relative;\n",
445
+ "}\n",
446
+ "\n",
447
+ "#sk-container-id-9 div.sk-text-repr-fallback {\n",
448
+ " display: none;\n",
449
+ "}\n",
450
+ "\n",
451
+ "div.sk-parallel-item,\n",
452
+ "div.sk-serial,\n",
453
+ "div.sk-item {\n",
454
+ " /* draw centered vertical line to link estimators */\n",
455
+ " background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n",
456
+ " background-size: 2px 100%;\n",
457
+ " background-repeat: no-repeat;\n",
458
+ " background-position: center center;\n",
459
+ "}\n",
460
+ "\n",
461
+ "/* Parallel-specific style estimator block */\n",
462
+ "\n",
463
+ "#sk-container-id-9 div.sk-parallel-item::after {\n",
464
+ " content: \"\";\n",
465
+ " width: 100%;\n",
466
+ " border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n",
467
+ " flex-grow: 1;\n",
468
+ "}\n",
469
+ "\n",
470
+ "#sk-container-id-9 div.sk-parallel {\n",
471
+ " display: flex;\n",
472
+ " align-items: stretch;\n",
473
+ " justify-content: center;\n",
474
+ " background-color: var(--sklearn-color-background);\n",
475
+ " position: relative;\n",
476
+ "}\n",
477
+ "\n",
478
+ "#sk-container-id-9 div.sk-parallel-item {\n",
479
+ " display: flex;\n",
480
+ " flex-direction: column;\n",
481
+ "}\n",
482
+ "\n",
483
+ "#sk-container-id-9 div.sk-parallel-item:first-child::after {\n",
484
+ " align-self: flex-end;\n",
485
+ " width: 50%;\n",
486
+ "}\n",
487
+ "\n",
488
+ "#sk-container-id-9 div.sk-parallel-item:last-child::after {\n",
489
+ " align-self: flex-start;\n",
490
+ " width: 50%;\n",
491
+ "}\n",
492
+ "\n",
493
+ "#sk-container-id-9 div.sk-parallel-item:only-child::after {\n",
494
+ " width: 0;\n",
495
+ "}\n",
496
+ "\n",
497
+ "/* Serial-specific style estimator block */\n",
498
+ "\n",
499
+ "#sk-container-id-9 div.sk-serial {\n",
500
+ " display: flex;\n",
501
+ " flex-direction: column;\n",
502
+ " align-items: center;\n",
503
+ " background-color: var(--sklearn-color-background);\n",
504
+ " padding-right: 1em;\n",
505
+ " padding-left: 1em;\n",
506
+ "}\n",
507
+ "\n",
508
+ "\n",
509
+ "/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n",
510
+ "clickable and can be expanded/collapsed.\n",
511
+ "- Pipeline and ColumnTransformer use this feature and define the default style\n",
512
+ "- Estimators will overwrite some part of the style using the `sk-estimator` class\n",
513
+ "*/\n",
514
+ "\n",
515
+ "/* Pipeline and ColumnTransformer style (default) */\n",
516
+ "\n",
517
+ "#sk-container-id-9 div.sk-toggleable {\n",
518
+ " /* Default theme specific background. It is overwritten whether we have a\n",
519
+ " specific estimator or a Pipeline/ColumnTransformer */\n",
520
+ " background-color: var(--sklearn-color-background);\n",
521
+ "}\n",
522
+ "\n",
523
+ "/* Toggleable label */\n",
524
+ "#sk-container-id-9 label.sk-toggleable__label {\n",
525
+ " cursor: pointer;\n",
526
+ " display: flex;\n",
527
+ " width: 100%;\n",
528
+ " margin-bottom: 0;\n",
529
+ " padding: 0.5em;\n",
530
+ " box-sizing: border-box;\n",
531
+ " text-align: center;\n",
532
+ " align-items: start;\n",
533
+ " justify-content: space-between;\n",
534
+ " gap: 0.5em;\n",
535
+ "}\n",
536
+ "\n",
537
+ "#sk-container-id-9 label.sk-toggleable__label .caption {\n",
538
+ " font-size: 0.6rem;\n",
539
+ " font-weight: lighter;\n",
540
+ " color: var(--sklearn-color-text-muted);\n",
541
+ "}\n",
542
+ "\n",
543
+ "#sk-container-id-9 label.sk-toggleable__label-arrow:before {\n",
544
+ " /* Arrow on the left of the label */\n",
545
+ " content: \"▸\";\n",
546
+ " float: left;\n",
547
+ " margin-right: 0.25em;\n",
548
+ " color: var(--sklearn-color-icon);\n",
549
+ "}\n",
550
+ "\n",
551
+ "#sk-container-id-9 label.sk-toggleable__label-arrow:hover:before {\n",
552
+ " color: var(--sklearn-color-text);\n",
553
+ "}\n",
554
+ "\n",
555
+ "/* Toggleable content - dropdown */\n",
556
+ "\n",
557
+ "#sk-container-id-9 div.sk-toggleable__content {\n",
558
+ " display: none;\n",
559
+ " text-align: left;\n",
560
+ " /* unfitted */\n",
561
+ " background-color: var(--sklearn-color-unfitted-level-0);\n",
562
+ "}\n",
563
+ "\n",
564
+ "#sk-container-id-9 div.sk-toggleable__content.fitted {\n",
565
+ " /* fitted */\n",
566
+ " background-color: var(--sklearn-color-fitted-level-0);\n",
567
+ "}\n",
568
+ "\n",
569
+ "#sk-container-id-9 div.sk-toggleable__content pre {\n",
570
+ " margin: 0.2em;\n",
571
+ " border-radius: 0.25em;\n",
572
+ " color: var(--sklearn-color-text);\n",
573
+ " /* unfitted */\n",
574
+ " background-color: var(--sklearn-color-unfitted-level-0);\n",
575
+ "}\n",
576
+ "\n",
577
+ "#sk-container-id-9 div.sk-toggleable__content.fitted pre {\n",
578
+ " /* unfitted */\n",
579
+ " background-color: var(--sklearn-color-fitted-level-0);\n",
580
+ "}\n",
581
+ "\n",
582
+ "#sk-container-id-9 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
583
+ " /* Expand drop-down */\n",
584
+ " display: block;\n",
585
+ " width: 100%;\n",
586
+ " overflow: visible;\n",
587
+ "}\n",
588
+ "\n",
589
+ "#sk-container-id-9 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
590
+ " content: \"▾\";\n",
591
+ "}\n",
592
+ "\n",
593
+ "/* Pipeline/ColumnTransformer-specific style */\n",
594
+ "\n",
595
+ "#sk-container-id-9 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
596
+ " color: var(--sklearn-color-text);\n",
597
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
598
+ "}\n",
599
+ "\n",
600
+ "#sk-container-id-9 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
601
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
602
+ "}\n",
603
+ "\n",
604
+ "/* Estimator-specific style */\n",
605
+ "\n",
606
+ "/* Colorize estimator box */\n",
607
+ "#sk-container-id-9 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
608
+ " /* unfitted */\n",
609
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
610
+ "}\n",
611
+ "\n",
612
+ "#sk-container-id-9 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
613
+ " /* fitted */\n",
614
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
615
+ "}\n",
616
+ "\n",
617
+ "#sk-container-id-9 div.sk-label label.sk-toggleable__label,\n",
618
+ "#sk-container-id-9 div.sk-label label {\n",
619
+ " /* The background is the default theme color */\n",
620
+ " color: var(--sklearn-color-text-on-default-background);\n",
621
+ "}\n",
622
+ "\n",
623
+ "/* On hover, darken the color of the background */\n",
624
+ "#sk-container-id-9 div.sk-label:hover label.sk-toggleable__label {\n",
625
+ " color: var(--sklearn-color-text);\n",
626
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
627
+ "}\n",
628
+ "\n",
629
+ "/* Label box, darken color on hover, fitted */\n",
630
+ "#sk-container-id-9 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
631
+ " color: var(--sklearn-color-text);\n",
632
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
633
+ "}\n",
634
+ "\n",
635
+ "/* Estimator label */\n",
636
+ "\n",
637
+ "#sk-container-id-9 div.sk-label label {\n",
638
+ " font-family: monospace;\n",
639
+ " font-weight: bold;\n",
640
+ " display: inline-block;\n",
641
+ " line-height: 1.2em;\n",
642
+ "}\n",
643
+ "\n",
644
+ "#sk-container-id-9 div.sk-label-container {\n",
645
+ " text-align: center;\n",
646
+ "}\n",
647
+ "\n",
648
+ "/* Estimator-specific */\n",
649
+ "#sk-container-id-9 div.sk-estimator {\n",
650
+ " font-family: monospace;\n",
651
+ " border: 1px dotted var(--sklearn-color-border-box);\n",
652
+ " border-radius: 0.25em;\n",
653
+ " box-sizing: border-box;\n",
654
+ " margin-bottom: 0.5em;\n",
655
+ " /* unfitted */\n",
656
+ " background-color: var(--sklearn-color-unfitted-level-0);\n",
657
+ "}\n",
658
+ "\n",
659
+ "#sk-container-id-9 div.sk-estimator.fitted {\n",
660
+ " /* fitted */\n",
661
+ " background-color: var(--sklearn-color-fitted-level-0);\n",
662
+ "}\n",
663
+ "\n",
664
+ "/* on hover */\n",
665
+ "#sk-container-id-9 div.sk-estimator:hover {\n",
666
+ " /* unfitted */\n",
667
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
668
+ "}\n",
669
+ "\n",
670
+ "#sk-container-id-9 div.sk-estimator.fitted:hover {\n",
671
+ " /* fitted */\n",
672
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
673
+ "}\n",
674
+ "\n",
675
+ "/* Specification for estimator info (e.g. \"i\" and \"?\") */\n",
676
+ "\n",
677
+ "/* Common style for \"i\" and \"?\" */\n",
678
+ "\n",
679
+ ".sk-estimator-doc-link,\n",
680
+ "a:link.sk-estimator-doc-link,\n",
681
+ "a:visited.sk-estimator-doc-link {\n",
682
+ " float: right;\n",
683
+ " font-size: smaller;\n",
684
+ " line-height: 1em;\n",
685
+ " font-family: monospace;\n",
686
+ " background-color: var(--sklearn-color-background);\n",
687
+ " border-radius: 1em;\n",
688
+ " height: 1em;\n",
689
+ " width: 1em;\n",
690
+ " text-decoration: none !important;\n",
691
+ " margin-left: 0.5em;\n",
692
+ " text-align: center;\n",
693
+ " /* unfitted */\n",
694
+ " border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
695
+ " color: var(--sklearn-color-unfitted-level-1);\n",
696
+ "}\n",
697
+ "\n",
698
+ ".sk-estimator-doc-link.fitted,\n",
699
+ "a:link.sk-estimator-doc-link.fitted,\n",
700
+ "a:visited.sk-estimator-doc-link.fitted {\n",
701
+ " /* fitted */\n",
702
+ " border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
703
+ " color: var(--sklearn-color-fitted-level-1);\n",
704
+ "}\n",
705
+ "\n",
706
+ "/* On hover */\n",
707
+ "div.sk-estimator:hover .sk-estimator-doc-link:hover,\n",
708
+ ".sk-estimator-doc-link:hover,\n",
709
+ "div.sk-label-container:hover .sk-estimator-doc-link:hover,\n",
710
+ ".sk-estimator-doc-link:hover {\n",
711
+ " /* unfitted */\n",
712
+ " background-color: var(--sklearn-color-unfitted-level-3);\n",
713
+ " color: var(--sklearn-color-background);\n",
714
+ " text-decoration: none;\n",
715
+ "}\n",
716
+ "\n",
717
+ "div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n",
718
+ ".sk-estimator-doc-link.fitted:hover,\n",
719
+ "div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n",
720
+ ".sk-estimator-doc-link.fitted:hover {\n",
721
+ " /* fitted */\n",
722
+ " background-color: var(--sklearn-color-fitted-level-3);\n",
723
+ " color: var(--sklearn-color-background);\n",
724
+ " text-decoration: none;\n",
725
+ "}\n",
726
+ "\n",
727
+ "/* Span, style for the box shown on hovering the info icon */\n",
728
+ ".sk-estimator-doc-link span {\n",
729
+ " display: none;\n",
730
+ " z-index: 9999;\n",
731
+ " position: relative;\n",
732
+ " font-weight: normal;\n",
733
+ " right: .2ex;\n",
734
+ " padding: .5ex;\n",
735
+ " margin: .5ex;\n",
736
+ " width: min-content;\n",
737
+ " min-width: 20ex;\n",
738
+ " max-width: 50ex;\n",
739
+ " color: var(--sklearn-color-text);\n",
740
+ " box-shadow: 2pt 2pt 4pt #999;\n",
741
+ " /* unfitted */\n",
742
+ " background: var(--sklearn-color-unfitted-level-0);\n",
743
+ " border: .5pt solid var(--sklearn-color-unfitted-level-3);\n",
744
+ "}\n",
745
+ "\n",
746
+ ".sk-estimator-doc-link.fitted span {\n",
747
+ " /* fitted */\n",
748
+ " background: var(--sklearn-color-fitted-level-0);\n",
749
+ " border: var(--sklearn-color-fitted-level-3);\n",
750
+ "}\n",
751
+ "\n",
752
+ ".sk-estimator-doc-link:hover span {\n",
753
+ " display: block;\n",
754
+ "}\n",
755
+ "\n",
756
+ "/* \"?\"-specific style due to the `<a>` HTML tag */\n",
757
+ "\n",
758
+ "#sk-container-id-9 a.estimator_doc_link {\n",
759
+ " float: right;\n",
760
+ " font-size: 1rem;\n",
761
+ " line-height: 1em;\n",
762
+ " font-family: monospace;\n",
763
+ " background-color: var(--sklearn-color-background);\n",
764
+ " border-radius: 1rem;\n",
765
+ " height: 1rem;\n",
766
+ " width: 1rem;\n",
767
+ " text-decoration: none;\n",
768
+ " /* unfitted */\n",
769
+ " color: var(--sklearn-color-unfitted-level-1);\n",
770
+ " border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
771
+ "}\n",
772
+ "\n",
773
+ "#sk-container-id-9 a.estimator_doc_link.fitted {\n",
774
+ " /* fitted */\n",
775
+ " border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
776
+ " color: var(--sklearn-color-fitted-level-1);\n",
777
+ "}\n",
778
+ "\n",
779
+ "/* On hover */\n",
780
+ "#sk-container-id-9 a.estimator_doc_link:hover {\n",
781
+ " /* unfitted */\n",
782
+ " background-color: var(--sklearn-color-unfitted-level-3);\n",
783
+ " color: var(--sklearn-color-background);\n",
784
+ " text-decoration: none;\n",
785
+ "}\n",
786
+ "\n",
787
+ "#sk-container-id-9 a.estimator_doc_link.fitted:hover {\n",
788
+ " /* fitted */\n",
789
+ " background-color: var(--sklearn-color-fitted-level-3);\n",
790
+ "}\n",
791
+ "\n",
792
+ ".estimator-table summary {\n",
793
+ " padding: .5rem;\n",
794
+ " font-family: monospace;\n",
795
+ " cursor: pointer;\n",
796
+ "}\n",
797
+ "\n",
798
+ ".estimator-table details[open] {\n",
799
+ " padding-left: 0.1rem;\n",
800
+ " padding-right: 0.1rem;\n",
801
+ " padding-bottom: 0.3rem;\n",
802
+ "}\n",
803
+ "\n",
804
+ ".estimator-table .parameters-table {\n",
805
+ " margin-left: auto !important;\n",
806
+ " margin-right: auto !important;\n",
807
+ "}\n",
808
+ "\n",
809
+ ".estimator-table .parameters-table tr:nth-child(odd) {\n",
810
+ " background-color: #fff;\n",
811
+ "}\n",
812
+ "\n",
813
+ ".estimator-table .parameters-table tr:nth-child(even) {\n",
814
+ " background-color: #f6f6f6;\n",
815
+ "}\n",
816
+ "\n",
817
+ ".estimator-table .parameters-table tr:hover {\n",
818
+ " background-color: #e0e0e0;\n",
819
+ "}\n",
820
+ "\n",
821
+ ".estimator-table table td {\n",
822
+ " border: 1px solid rgba(106, 105, 104, 0.232);\n",
823
+ "}\n",
824
+ "\n",
825
+ ".user-set td {\n",
826
+ " color:rgb(255, 94, 0);\n",
827
+ " text-align: left;\n",
828
+ "}\n",
829
+ "\n",
830
+ ".user-set td.value pre {\n",
831
+ " color:rgb(255, 94, 0) !important;\n",
832
+ " background-color: transparent !important;\n",
833
+ "}\n",
834
+ "\n",
835
+ ".default td {\n",
836
+ " color: black;\n",
837
+ " text-align: left;\n",
838
+ "}\n",
839
+ "\n",
840
+ ".user-set td i,\n",
841
+ ".default td i {\n",
842
+ " color: black;\n",
843
+ "}\n",
844
+ "\n",
845
+ ".copy-paste-icon {\n",
846
+ " background-image: url(data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgMCA0NDggNTEyIj48IS0tIUZvbnQgQXdlc29tZSBGcmVlIDYuNy4yIGJ5IEBmb250YXdlc29tZSAtIGh0dHBzOi8vZm9udGF3ZXNvbWUuY29tIExpY2Vuc2UgLSBodHRwczovL2ZvbnRhd2Vzb21lLmNvbS9saWNlbnNlL2ZyZWUgQ29weXJpZ2h0IDIwMjUgRm9udGljb25zLCBJbmMuLS0+PHBhdGggZD0iTTIwOCAwTDMzMi4xIDBjMTIuNyAwIDI0LjkgNS4xIDMzLjkgMTQuMWw2Ny45IDY3LjljOSA5IDE0LjEgMjEuMiAxNC4xIDMzLjlMNDQ4IDMzNmMwIDI2LjUtMjEuNSA0OC00OCA0OGwtMTkyIDBjLTI2LjUgMC00OC0yMS41LTQ4LTQ4bDAtMjg4YzAtMjYuNSAyMS41LTQ4IDQ4LTQ4ek00OCAxMjhsODAgMCAwIDY0LTY0IDAgMCAyNTYgMTkyIDAgMC0zMiA2NCAwIDAgNDhjMCAyNi41LTIxLjUgNDgtNDggNDhMNDggNTEyYy0yNi41IDAtNDgtMjEuNS00OC00OEwwIDE3NmMwLTI2LjUgMjEuNS00OCA0OC00OHoiLz48L3N2Zz4=);\n",
847
+ " background-repeat: no-repeat;\n",
848
+ " background-size: 14px 14px;\n",
849
+ " background-position: 0;\n",
850
+ " display: inline-block;\n",
851
+ " width: 14px;\n",
852
+ " height: 14px;\n",
853
+ " cursor: pointer;\n",
854
+ "}\n",
855
+ "</style><body><div id=\"sk-container-id-9\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>MultiOutputClassifier(estimator=LogisticRegression(class_weight=&#x27;balanced&#x27;,\n",
856
+ " max_iter=1000))</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-25\" type=\"checkbox\" ><label for=\"sk-estimator-id-25\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow\"><div><div>MultiOutputClassifier</div></div><div><a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.7/modules/generated/sklearn.multioutput.MultiOutputClassifier.html\">?<span>Documentation for MultiOutputClassifier</span></a><span class=\"sk-estimator-doc-link fitted\">i<span>Fitted</span></span></div></label><div class=\"sk-toggleable__content fitted\" data-param-prefix=\"\">\n",
857
+ " <div class=\"estimator-table\">\n",
858
+ " <details>\n",
859
+ " <summary>Parameters</summary>\n",
860
+ " <table class=\"parameters-table\">\n",
861
+ " <tbody>\n",
862
+ " \n",
863
+ " <tr class=\"user-set\">\n",
864
+ " <td><i class=\"copy-paste-icon\"\n",
865
+ " onclick=\"copyToClipboard('estimator',\n",
866
+ " this.parentElement.nextElementSibling)\"\n",
867
+ " ></i></td>\n",
868
+ " <td class=\"param\">estimator&nbsp;</td>\n",
869
+ " <td class=\"value\">LogisticRegre...max_iter=1000)</td>\n",
870
+ " </tr>\n",
871
+ " \n",
872
+ "\n",
873
+ " <tr class=\"default\">\n",
874
+ " <td><i class=\"copy-paste-icon\"\n",
875
+ " onclick=\"copyToClipboard('n_jobs',\n",
876
+ " this.parentElement.nextElementSibling)\"\n",
877
+ " ></i></td>\n",
878
+ " <td class=\"param\">n_jobs&nbsp;</td>\n",
879
+ " <td class=\"value\">None</td>\n",
880
+ " </tr>\n",
881
+ " \n",
882
+ " </tbody>\n",
883
+ " </table>\n",
884
+ " </details>\n",
885
+ " </div>\n",
886
+ " </div></div></div><div class=\"sk-parallel\"><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-26\" type=\"checkbox\" ><label for=\"sk-estimator-id-26\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow\"><div><div>estimator: LogisticRegression</div></div></label><div class=\"sk-toggleable__content fitted\" data-param-prefix=\"estimator__\"><pre>LogisticRegression(class_weight=&#x27;balanced&#x27;, max_iter=1000)</pre></div></div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-27\" type=\"checkbox\" ><label for=\"sk-estimator-id-27\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow\"><div><div>LogisticRegression</div></div><div><a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.7/modules/generated/sklearn.linear_model.LogisticRegression.html\">?<span>Documentation for LogisticRegression</span></a></div></label><div class=\"sk-toggleable__content fitted\" data-param-prefix=\"estimator__\">\n",
887
+ " <div class=\"estimator-table\">\n",
888
+ " <details>\n",
889
+ " <summary>Parameters</summary>\n",
890
+ " <table class=\"parameters-table\">\n",
891
+ " <tbody>\n",
892
+ " \n",
893
+ " <tr class=\"default\">\n",
894
+ " <td><i class=\"copy-paste-icon\"\n",
895
+ " onclick=\"copyToClipboard('penalty',\n",
896
+ " this.parentElement.nextElementSibling)\"\n",
897
+ " ></i></td>\n",
898
+ " <td class=\"param\">penalty&nbsp;</td>\n",
899
+ " <td class=\"value\">&#x27;l2&#x27;</td>\n",
900
+ " </tr>\n",
901
+ " \n",
902
+ "\n",
903
+ " <tr class=\"default\">\n",
904
+ " <td><i class=\"copy-paste-icon\"\n",
905
+ " onclick=\"copyToClipboard('dual',\n",
906
+ " this.parentElement.nextElementSibling)\"\n",
907
+ " ></i></td>\n",
908
+ " <td class=\"param\">dual&nbsp;</td>\n",
909
+ " <td class=\"value\">False</td>\n",
910
+ " </tr>\n",
911
+ " \n",
912
+ "\n",
913
+ " <tr class=\"default\">\n",
914
+ " <td><i class=\"copy-paste-icon\"\n",
915
+ " onclick=\"copyToClipboard('tol',\n",
916
+ " this.parentElement.nextElementSibling)\"\n",
917
+ " ></i></td>\n",
918
+ " <td class=\"param\">tol&nbsp;</td>\n",
919
+ " <td class=\"value\">0.0001</td>\n",
920
+ " </tr>\n",
921
+ " \n",
922
+ "\n",
923
+ " <tr class=\"default\">\n",
924
+ " <td><i class=\"copy-paste-icon\"\n",
925
+ " onclick=\"copyToClipboard('C',\n",
926
+ " this.parentElement.nextElementSibling)\"\n",
927
+ " ></i></td>\n",
928
+ " <td class=\"param\">C&nbsp;</td>\n",
929
+ " <td class=\"value\">1.0</td>\n",
930
+ " </tr>\n",
931
+ " \n",
932
+ "\n",
933
+ " <tr class=\"default\">\n",
934
+ " <td><i class=\"copy-paste-icon\"\n",
935
+ " onclick=\"copyToClipboard('fit_intercept',\n",
936
+ " this.parentElement.nextElementSibling)\"\n",
937
+ " ></i></td>\n",
938
+ " <td class=\"param\">fit_intercept&nbsp;</td>\n",
939
+ " <td class=\"value\">True</td>\n",
940
+ " </tr>\n",
941
+ " \n",
942
+ "\n",
943
+ " <tr class=\"default\">\n",
944
+ " <td><i class=\"copy-paste-icon\"\n",
945
+ " onclick=\"copyToClipboard('intercept_scaling',\n",
946
+ " this.parentElement.nextElementSibling)\"\n",
947
+ " ></i></td>\n",
948
+ " <td class=\"param\">intercept_scaling&nbsp;</td>\n",
949
+ " <td class=\"value\">1</td>\n",
950
+ " </tr>\n",
951
+ " \n",
952
+ "\n",
953
+ " <tr class=\"user-set\">\n",
954
+ " <td><i class=\"copy-paste-icon\"\n",
955
+ " onclick=\"copyToClipboard('class_weight',\n",
956
+ " this.parentElement.nextElementSibling)\"\n",
957
+ " ></i></td>\n",
958
+ " <td class=\"param\">class_weight&nbsp;</td>\n",
959
+ " <td class=\"value\">&#x27;balanced&#x27;</td>\n",
960
+ " </tr>\n",
961
+ " \n",
962
+ "\n",
963
+ " <tr class=\"default\">\n",
964
+ " <td><i class=\"copy-paste-icon\"\n",
965
+ " onclick=\"copyToClipboard('random_state',\n",
966
+ " this.parentElement.nextElementSibling)\"\n",
967
+ " ></i></td>\n",
968
+ " <td class=\"param\">random_state&nbsp;</td>\n",
969
+ " <td class=\"value\">None</td>\n",
970
+ " </tr>\n",
971
+ " \n",
972
+ "\n",
973
+ " <tr class=\"default\">\n",
974
+ " <td><i class=\"copy-paste-icon\"\n",
975
+ " onclick=\"copyToClipboard('solver',\n",
976
+ " this.parentElement.nextElementSibling)\"\n",
977
+ " ></i></td>\n",
978
+ " <td class=\"param\">solver&nbsp;</td>\n",
979
+ " <td class=\"value\">&#x27;lbfgs&#x27;</td>\n",
980
+ " </tr>\n",
981
+ " \n",
982
+ "\n",
983
+ " <tr class=\"user-set\">\n",
984
+ " <td><i class=\"copy-paste-icon\"\n",
985
+ " onclick=\"copyToClipboard('max_iter',\n",
986
+ " this.parentElement.nextElementSibling)\"\n",
987
+ " ></i></td>\n",
988
+ " <td class=\"param\">max_iter&nbsp;</td>\n",
989
+ " <td class=\"value\">1000</td>\n",
990
+ " </tr>\n",
991
+ " \n",
992
+ "\n",
993
+ " <tr class=\"default\">\n",
994
+ " <td><i class=\"copy-paste-icon\"\n",
995
+ " onclick=\"copyToClipboard('multi_class',\n",
996
+ " this.parentElement.nextElementSibling)\"\n",
997
+ " ></i></td>\n",
998
+ " <td class=\"param\">multi_class&nbsp;</td>\n",
999
+ " <td class=\"value\">&#x27;deprecated&#x27;</td>\n",
1000
+ " </tr>\n",
1001
+ " \n",
1002
+ "\n",
1003
+ " <tr class=\"default\">\n",
1004
+ " <td><i class=\"copy-paste-icon\"\n",
1005
+ " onclick=\"copyToClipboard('verbose',\n",
1006
+ " this.parentElement.nextElementSibling)\"\n",
1007
+ " ></i></td>\n",
1008
+ " <td class=\"param\">verbose&nbsp;</td>\n",
1009
+ " <td class=\"value\">0</td>\n",
1010
+ " </tr>\n",
1011
+ " \n",
1012
+ "\n",
1013
+ " <tr class=\"default\">\n",
1014
+ " <td><i class=\"copy-paste-icon\"\n",
1015
+ " onclick=\"copyToClipboard('warm_start',\n",
1016
+ " this.parentElement.nextElementSibling)\"\n",
1017
+ " ></i></td>\n",
1018
+ " <td class=\"param\">warm_start&nbsp;</td>\n",
1019
+ " <td class=\"value\">False</td>\n",
1020
+ " </tr>\n",
1021
+ " \n",
1022
+ "\n",
1023
+ " <tr class=\"default\">\n",
1024
+ " <td><i class=\"copy-paste-icon\"\n",
1025
+ " onclick=\"copyToClipboard('n_jobs',\n",
1026
+ " this.parentElement.nextElementSibling)\"\n",
1027
+ " ></i></td>\n",
1028
+ " <td class=\"param\">n_jobs&nbsp;</td>\n",
1029
+ " <td class=\"value\">None</td>\n",
1030
+ " </tr>\n",
1031
+ " \n",
1032
+ "\n",
1033
+ " <tr class=\"default\">\n",
1034
+ " <td><i class=\"copy-paste-icon\"\n",
1035
+ " onclick=\"copyToClipboard('l1_ratio',\n",
1036
+ " this.parentElement.nextElementSibling)\"\n",
1037
+ " ></i></td>\n",
1038
+ " <td class=\"param\">l1_ratio&nbsp;</td>\n",
1039
+ " <td class=\"value\">None</td>\n",
1040
+ " </tr>\n",
1041
+ " \n",
1042
+ " </tbody>\n",
1043
+ " </table>\n",
1044
+ " </details>\n",
1045
+ " </div>\n",
1046
+ " </div></div></div></div></div></div></div></div></div></div><script>function copyToClipboard(text, element) {\n",
1047
+ " // Get the parameter prefix from the closest toggleable content\n",
1048
+ " const toggleableContent = element.closest('.sk-toggleable__content');\n",
1049
+ " const paramPrefix = toggleableContent ? toggleableContent.dataset.paramPrefix : '';\n",
1050
+ " const fullParamName = paramPrefix ? `${paramPrefix}${text}` : text;\n",
1051
+ "\n",
1052
+ " const originalStyle = element.style;\n",
1053
+ " const computedStyle = window.getComputedStyle(element);\n",
1054
+ " const originalWidth = computedStyle.width;\n",
1055
+ " const originalHTML = element.innerHTML.replace('Copied!', '');\n",
1056
+ "\n",
1057
+ " navigator.clipboard.writeText(fullParamName)\n",
1058
+ " .then(() => {\n",
1059
+ " element.style.width = originalWidth;\n",
1060
+ " element.style.color = 'green';\n",
1061
+ " element.innerHTML = \"Copied!\";\n",
1062
+ "\n",
1063
+ " setTimeout(() => {\n",
1064
+ " element.innerHTML = originalHTML;\n",
1065
+ " element.style = originalStyle;\n",
1066
+ " }, 2000);\n",
1067
+ " })\n",
1068
+ " .catch(err => {\n",
1069
+ " console.error('Failed to copy:', err);\n",
1070
+ " element.style.color = 'red';\n",
1071
+ " element.innerHTML = \"Failed!\";\n",
1072
+ " setTimeout(() => {\n",
1073
+ " element.innerHTML = originalHTML;\n",
1074
+ " element.style = originalStyle;\n",
1075
+ " }, 2000);\n",
1076
+ " });\n",
1077
+ " return false;\n",
1078
+ "}\n",
1079
+ "\n",
1080
+ "document.querySelectorAll('.fa-regular.fa-copy').forEach(function(element) {\n",
1081
+ " const toggleableContent = element.closest('.sk-toggleable__content');\n",
1082
+ " const paramPrefix = toggleableContent ? toggleableContent.dataset.paramPrefix : '';\n",
1083
+ " const paramName = element.parentElement.nextElementSibling.textContent.trim();\n",
1084
+ " const fullParamName = paramPrefix ? `${paramPrefix}${paramName}` : paramName;\n",
1085
+ "\n",
1086
+ " element.setAttribute('title', fullParamName);\n",
1087
+ "});\n",
1088
+ "</script></body>"
1089
+ ],
1090
+ "text/plain": [
1091
+ "MultiOutputClassifier(estimator=LogisticRegression(class_weight='balanced',\n",
1092
+ " max_iter=1000))"
1093
+ ]
1094
+ },
1095
+ "execution_count": 375,
1096
+ "metadata": {},
1097
+ "output_type": "execute_result"
1098
+ }
1099
+ ],
1100
+ "source": [
1101
+ "model.fit(x_train_tfidf,y_train_encoded)"
1102
+ ]
1103
+ },
1104
+ {
1105
+ "cell_type": "markdown",
1106
+ "id": "266f92ec-f07f-42f3-a4fb-4ed84f59f453",
1107
+ "metadata": {},
1108
+ "source": [
1109
+ "### With the default threshold, logistic regression predicts a label when its probability is greater than 0.5"
1110
+ ]
1111
+ },
1112
+ {
1113
+ "cell_type": "code",
1114
+ "execution_count": 376,
1115
+ "id": "e22f91a1-8e4c-454f-b43e-c2b9067749ff",
1116
+ "metadata": {},
1117
+ "outputs": [],
1118
+ "source": [
1119
+ "y_pred_default_prob=model.predict(x_test_tfidf)"
1120
+ ]
1121
+ },
1122
+ {
1123
+ "cell_type": "code",
1124
+ "execution_count": 377,
1125
+ "id": "4f51fa10-8b99-4473-9655-fb81b1c71b65",
1126
+ "metadata": {},
1127
+ "outputs": [
1128
+ {
1129
+ "data": {
1130
+ "text/plain": [
1131
+ "[('dataframe', 'pandas', 'python', 'r', 'spring-boot-2')]"
1132
+ ]
1133
+ },
1134
+ "execution_count": 377,
1135
+ "metadata": {},
1136
+ "output_type": "execute_result"
1137
+ }
1138
+ ],
1139
+ "source": [
1140
+ "ml.inverse_transform(y_pred_default_prob)"
1141
+ ]
1142
+ },
1143
+ {
1144
+ "cell_type": "code",
1145
+ "execution_count": 442,
1146
+ "id": "6a92a35c-a678-41c3-bc67-9cbf4f56906f",
1147
+ "metadata": {},
1148
+ "outputs": [
1149
+ {
1150
+ "data": {
1151
+ "text/plain": [
1152
+ "[('r',)]"
1153
+ ]
1154
+ },
1155
+ "execution_count": 442,
1156
+ "metadata": {},
1157
+ "output_type": "execute_result"
1158
+ }
1159
+ ],
1160
+ "source": [
1161
+ "ml.inverse_transform(y_encoded_test)"
1162
+ ]
1163
+ },
1164
+ {
1165
+ "cell_type": "code",
1166
+ "execution_count": 378,
1167
+ "id": "fe4ec6d6-17b6-4f4a-aab3-8a3fa7d86136",
1168
+ "metadata": {},
1169
+ "outputs": [
1170
+ {
1171
+ "name": "stdout",
1172
+ "output_type": "stream",
1173
+ "text": [
1174
+ "Hamming Loss: 0.0009823182711198428\n",
1175
+ "Accuracy Score: 0.0\n"
1176
+ ]
1177
+ }
1178
+ ],
1179
+ "source": [
1180
+ "print(\"Hamming Loss:\",hamming_loss(y_encoded_test,y_pred_default_prob))\n",
1181
+ "print(\"Accuracy Score:\",accuracy_score(y_encoded_test,y_pred_default_prob))"
1182
+ ]
1183
+ },
1184
+ {
1185
+ "cell_type": "code",
1186
+ "execution_count": 379,
1187
+ "id": "c114e272-e3a1-43cd-b638-025fc0286b46",
1188
+ "metadata": {},
1189
+ "outputs": [
1190
+ {
1191
+ "name": "stdout",
1192
+ "output_type": "stream",
1193
+ "text": [
1194
+ "F1 Score (micro): 0.3333333333333333\n",
1195
+ "F1 Score (macro): 0.0002455795677799607\n"
1196
+ ]
1197
+ }
1198
+ ],
1199
+ "source": [
1200
+ "print(\"F1 Score (micro):\", f1_score(y_encoded_test,y_pred_default_prob, average='micro',zero_division=0))\n",
1201
+ "print(\"F1 Score (macro):\", f1_score(y_encoded_test,y_pred_default_prob, average='macro',zero_division=0))"
1202
+ ]
1203
+ },
1204
+ {
1205
+ "cell_type": "markdown",
1206
+ "id": "86956b7a-f9ce-402a-9d5f-d1e91d54eb81",
1207
+ "metadata": {},
1208
+ "source": [
1209
+ "### I want to predict only the labels whose probability is greater than 0.75, so I apply that threshold after getting the probabilities of all labels"
1210
+ ]
1211
+ },
1212
+ {
1213
+ "cell_type": "code",
1214
+ "execution_count": 380,
1215
+ "id": "15ada427-5afa-4638-a938-ab35ab3a9d1b",
1216
+ "metadata": {},
1217
+ "outputs": [],
1218
+ "source": [
1219
+ "y_probs=model.predict_proba(x_test_tfidf)"
1220
+ ]
1221
+ },
1222
+ {
1223
+ "cell_type": "code",
1224
+ "execution_count": 381,
1225
+ "id": "afe9180d-65e7-414a-bf39-32260c205724",
1226
+ "metadata": {},
1227
+ "outputs": [],
1228
+ "source": [
1229
+ "threshold=0.75\n",
1230
+ "probs_column1=np.array([i[0:,1] for i in y_probs]).T\n",
1231
+ "y_pred_customized_prob=(probs_column1>threshold).astype(int)"
1232
+ ]
1233
+ },
1234
+ {
1235
+ "cell_type": "code",
1236
+ "execution_count": 441,
1237
+ "id": "14f950ff-2f62-4bfa-bd2a-c9e8ec8993b7",
1238
+ "metadata": {},
1239
+ "outputs": [
1240
+ {
1241
+ "data": {
1242
+ "text/plain": [
1243
+ "[('r',)]"
1244
+ ]
1245
+ },
1246
+ "execution_count": 441,
1247
+ "metadata": {},
1248
+ "output_type": "execute_result"
1249
+ }
1250
+ ],
1251
+ "source": [
1252
+ "ml.inverse_transform(y_pred_customized_prob)"
1253
+ ]
1254
+ },
1255
+ {
1256
+ "cell_type": "code",
1257
+ "execution_count": 444,
1258
+ "id": "5cd57d27-55db-41a9-a770-a9a5fd4fbf64",
1259
+ "metadata": {},
1260
+ "outputs": [
1261
+ {
1262
+ "data": {
1263
+ "text/plain": [
1264
+ "[('r',)]"
1265
+ ]
1266
+ },
1267
+ "execution_count": 444,
1268
+ "metadata": {},
1269
+ "output_type": "execute_result"
1270
+ }
1271
+ ],
1272
+ "source": [
1273
+ "ml.inverse_transform(y_encoded_test)"
1274
+ ]
1275
+ },
1276
+ {
1277
+ "cell_type": "code",
1278
+ "execution_count": 382,
1279
+ "id": "c87755ac-4682-44fa-8bae-2acfb550d867",
1280
+ "metadata": {},
1281
+ "outputs": [
1282
+ {
1283
+ "name": "stdout",
1284
+ "output_type": "stream",
1285
+ "text": [
1286
+ "Hamming Loss: 0.0\n",
1287
+ "Accuracy Score: 1.0\n"
1288
+ ]
1289
+ }
1290
+ ],
1291
+ "source": [
1292
+ "print(\"Hamming Loss:\",hamming_loss(y_encoded_test,y_pred_customized_prob))\n",
1293
+ "print(\"Accuracy Score:\",accuracy_score(y_encoded_test,y_pred_customized_prob))"
1294
+ ]
1295
+ },
1296
+ {
1297
+ "cell_type": "code",
1298
+ "execution_count": 383,
1299
+ "id": "12a25f36-3cf7-457d-9c4e-124a2d9e6447",
1300
+ "metadata": {},
1301
+ "outputs": [
1302
+ {
1303
+ "name": "stdout",
1304
+ "output_type": "stream",
1305
+ "text": [
1306
+ "F1 Score (micro): 1.0\n",
1307
+ "F1 Score (macro): 0.0002455795677799607\n"
1308
+ ]
1309
+ }
1310
+ ],
1311
+ "source": [
1312
+ "print(\"F1 Score (micro):\", f1_score(y_encoded_test,y_pred_customized_prob, average='micro',zero_division=0))\n",
1313
+ "print(\"F1 Score (macro):\", f1_score(y_encoded_test,y_pred_customized_prob, average='macro',zero_division=0))"
1314
+ ]
1315
+ },
1316
+ {
1317
+ "cell_type": "markdown",
1318
+ "id": "9b125ddb-0691-418a-970f-73660b751a24",
1319
+ "metadata": {},
1320
+ "source": [
1321
+ "### Only for use in Streamlit: take a question body as x_test and predict its tags"
1322
+ ]
1323
+ },
1324
+ {
1325
+ "cell_type": "markdown",
1326
+ "id": "3d767df0-f1ff-4210-80c0-e310a4247e64",
1327
+ "metadata": {},
1328
+ "source": [
1329
+ "## Testing question body as x_test"
1330
+ ]
1331
+ },
1332
+ {
1333
+ "cell_type": "code",
1334
+ "execution_count": 392,
1335
+ "id": "cf5fb6b7-02e5-4abb-9b74-9c63acada17b",
1336
+ "metadata": {},
1337
+ "outputs": [
1338
+ {
1339
+ "data": {
1340
+ "text/plain": [
1341
+ "0 https://stackoverflow.com//questions/79689975/...\n",
1342
+ "Name: questions_url, dtype: object"
1343
+ ]
1344
+ },
1345
+ "execution_count": 392,
1346
+ "metadata": {},
1347
+ "output_type": "execute_result"
1348
+ }
1349
+ ],
1350
+ "source": [
1351
+ "df[\"questions_url\"][df[\"id\"]==79689975][0:]"
1352
+ ]
1353
+ },
1354
+ {
1355
+ "cell_type": "code",
1356
+ "execution_count": 415,
1357
+ "id": "4d4d8e85-f9bf-448b-a9f0-93d6eb8143b8",
1358
+ "metadata": {},
1359
+ "outputs": [],
1360
+ "source": [
1361
+ "y_testing=df[\"tags\"][df[\"id\"]==79689975]\n",
1362
+ "y_testing_encoded=m.transform(y_testing)"
1363
+ ]
1364
+ },
1365
+ {
1366
+ "cell_type": "code",
1367
+ "execution_count": 435,
1368
+ "id": "cb91ca42-36f7-48bd-8da0-09764bccfded",
1369
+ "metadata": {},
1370
+ "outputs": [],
1371
+ "source": [
1372
+ "xtest_question_from_url='''I want to retrieve hierarchical aggregates of a dataframe, i. e. aggregating the data by an increasing number of grouping variables.'''\n",
1373
+ "question=[]\n",
1374
+ "final_xtest=sahi_karneka_function(xtest_question_from_url)\n",
1375
+ "question.append(final_xtest)\n",
1376
+ "tfidf_xtest_pipeline_question=tfidf_st.transform(question)"
1377
+ ]
1378
+ },
1379
+ {
1380
+ "cell_type": "code",
1381
+ "execution_count": 436,
1382
+ "id": "bc6fff47-cd90-42cd-b366-e78ba78533c2",
1383
+ "metadata": {},
1384
+ "outputs": [],
1385
+ "source": [
1386
+ "y_pred=model.predict(tfidf_xtest_pipeline_question)"
1387
+ ]
1388
+ },
1389
+ {
1390
+ "cell_type": "code",
1391
+ "execution_count": 438,
1392
+ "id": "c4b57c8e-e524-49b2-85e9-f1b16c2349ea",
1393
+ "metadata": {},
1394
+ "outputs": [],
1395
+ "source": [
1396
+ "y_probs=model.predict_proba(tfidf_xtest_pipeline_question)"
1397
+ ]
1398
+ },
1399
+ {
1400
+ "cell_type": "code",
1401
+ "execution_count": 439,
1402
+ "id": "e2bccb82-69a6-4da3-8bd6-efa41c18e36b",
1403
+ "metadata": {},
1404
+ "outputs": [],
1405
+ "source": [
1406
+ "threshold=0.75\n",
1407
+ "probs_column1=np.array([i[0:,1] for i in y_probs]).T\n",
1408
+ "y_pred_customized_prob=(probs_column1>threshold).astype(int)"
1409
+ ]
1410
+ },
1411
+ {
1412
+ "cell_type": "code",
1413
+ "execution_count": 440,
1414
+ "id": "e40814c4-2db1-47ff-8dea-4544f45bcae1",
1415
+ "metadata": {},
1416
+ "outputs": [
1417
+ {
1418
+ "name": "stdout",
1419
+ "output_type": "stream",
1420
+ "text": [
1421
+ "Hamming Loss: 0.0\n",
1422
+ "Accuracy Score: 1.0\n"
1423
+ ]
1424
+ }
1425
+ ],
1426
+ "source": [
1427
+ "print(\"Hamming Loss:\",hamming_loss(y_testing_encoded,y_pred_customized_prob))\n",
1428
+ "print(\"Accuracy Score:\",accuracy_score(y_testing_encoded,y_pred_customized_prob))"
1429
+ ]
1430
+ },
1431
+ {
1432
+ "cell_type": "code",
1433
+ "execution_count": 446,
1434
+ "id": "10c08944-83e9-4f81-a7da-03e9bee517d6",
1435
+ "metadata": {},
1436
+ "outputs": [
1437
+ {
1438
+ "data": {
1439
+ "text/plain": [
1440
+ "[('r',)]"
1441
+ ]
1442
+ },
1443
+ "execution_count": 446,
1444
+ "metadata": {},
1445
+ "output_type": "execute_result"
1446
+ }
1447
+ ],
1448
+ "source": [
1449
+ "ml.inverse_transform(y_pred_customized_prob)"
1450
+ ]
1451
+ },
1452
+ {
1453
+ "cell_type": "code",
1454
+ "execution_count": 445,
1455
+ "id": "6802d5f4-e285-426f-87e2-c0341227e971",
1456
+ "metadata": {},
1457
+ "outputs": [
1458
+ {
1459
+ "data": {
1460
+ "text/plain": [
1461
+ "[('r',)]"
1462
+ ]
1463
+ },
1464
+ "execution_count": 445,
1465
+ "metadata": {},
1466
+ "output_type": "execute_result"
1467
+ }
1468
+ ],
1469
+ "source": [
1470
+ "ml.inverse_transform(y_testing_encoded)"
1471
+ ]
1472
+ },
1473
+ {
1474
+ "cell_type": "markdown",
1475
+ "id": "a69c48ab-a178-4231-8b3e-6cec68b6257a",
1476
+ "metadata": {},
1477
+ "source": [
1478
+ "### ------------------------Finish training code and streamlit testing---------------The end--------------------"
1479
+ ]
1480
+ },
1481
+ {
1482
+ "cell_type": "code",
1483
+ "execution_count": 448,
1484
+ "id": "8acac052-cdb3-4b6c-9ce8-c6c6bf88c613",
1485
+ "metadata": {},
1486
+ "outputs": [
1487
+ {
1488
+ "data": {
1489
+ "text/plain": [
1490
+ "['logistic_model.pkl']"
1491
+ ]
1492
+ },
1493
+ "execution_count": 448,
1494
+ "metadata": {},
1495
+ "output_type": "execute_result"
1496
+ }
1497
+ ],
1498
+ "source": [
1499
+ "joblib.dump(model,\"logistic_model.pkl\")"
1500
+ ]
1501
+ },
1502
+ {
1503
+ "cell_type": "code",
1504
+ "execution_count": null,
1505
+ "id": "572472b0-5a36-42ce-b099-83d749ff108a",
1506
+ "metadata": {},
1507
+ "outputs": [],
1508
+ "source": []
1509
+ }
1510
+ ],
1511
+ "metadata": {
1512
+ "kernelspec": {
1513
+ "display_name": "Python 3 (ipykernel)",
1514
+ "language": "python",
1515
+ "name": "python3"
1516
+ },
1517
+ "language_info": {
1518
+ "codemirror_mode": {
1519
+ "name": "ipython",
1520
+ "version": 3
1521
+ },
1522
+ "file_extension": ".py",
1523
+ "mimetype": "text/x-python",
1524
+ "name": "python",
1525
+ "nbconvert_exporter": "python",
1526
+ "pygments_lexer": "ipython3",
1527
+ "version": "3.12.7"
1528
+ }
1529
+ },
1530
+ "nbformat": 4,
1531
+ "nbformat_minor": 5
1532
+ }
final_dataset.csv ADDED
The diff for this file is too large to render. See raw diff
 
questions_data.ipynb ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 42,
6
+ "id": "78e51fa0-70cf-4f3e-ae19-db8d76a5d7cb",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import requests as req\n",
11
+ "from bs4 import BeautifulSoup as b\n",
12
+ "import pandas as pd\n",
13
+ "d={}\n",
14
+ "for i in range(0,110):\n",
15
+ " res=req.get(f\"https://stackoverflow.com/questions?tab=newest&page={i}\")\n",
16
+ " soup1=b(res.text,\"html.parser\")\n",
17
+ " titles=soup1.select(\".s-post-summary\")\n",
18
+ " for k in range(len(titles)):\n",
19
+ " id_data=titles[k].get(\"data-post-id\")\n",
20
+ " href=titles[k].select_one(\".s-link\").get(\"href\")\n",
21
+ " url=r\"https://stackoverflow.com/\"+href\n",
22
+ " d[id_data]=url"
23
+ ]
24
+ },
25
+ {
26
+ "cell_type": "code",
27
+ "execution_count": 43,
28
+ "id": "b5415465-7742-4cc1-aa0e-2d28411f06bc",
29
+ "metadata": {},
30
+ "outputs": [],
31
+ "source": [
32
+ "df=pd.DataFrame(data=list(d.items()), columns=[\"id\", \"questions_url\"])"
33
+ ]
34
+ },
35
+ {
36
+ "cell_type": "code",
37
+ "execution_count": 44,
38
+ "id": "852b7aa1-1a0d-4e42-bebb-4969c9bbf712",
39
+ "metadata": {},
40
+ "outputs": [],
41
+ "source": [
42
+ "df.to_csv(r\"C:\\Users\\sss\\Documents\\datas_urls_questions.csv\")"
43
+ ]
44
+ },
45
+ {
46
+ "cell_type": "code",
47
+ "execution_count": null,
48
+ "id": "a82fe316-1da4-42cb-aae5-f214934eeda8",
49
+ "metadata": {},
50
+ "outputs": [],
51
+ "source": []
52
+ }
53
+ ],
54
+ "metadata": {
55
+ "kernelspec": {
56
+ "display_name": "Python 3 (ipykernel)",
57
+ "language": "python",
58
+ "name": "python3"
59
+ },
60
+ "language_info": {
61
+ "codemirror_mode": {
62
+ "name": "ipython",
63
+ "version": 3
64
+ },
65
+ "file_extension": ".py",
66
+ "mimetype": "text/x-python",
67
+ "name": "python",
68
+ "nbconvert_exporter": "python",
69
+ "pygments_lexer": "ipython3",
70
+ "version": "3.12.7"
71
+ }
72
+ },
73
+ "nbformat": 4,
74
+ "nbformat_minor": 5
75
+ }
tags_data.ipynb ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 54,
6
+ "id": "f46e7606-82b3-4f4b-9d4c-90c760c7a911",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import requests as req\n",
11
+ "from bs4 import BeautifulSoup as b\n",
12
+ "d={}\n",
13
+ "for i in range(0,110):\n",
14
+ " res = req.get(f\"https://stackoverflow.com/questions?tab=newest&page={i}\")\n",
15
+ " soup1 = b(res.text, \"html.parser\")\n",
16
+ " titles = soup1.select(\".s-post-summary\")\n",
17
+ "\n",
18
+ " for k in range(len(titles)):\n",
19
+ " id_datas=titles[k].get(\"data-post-id\")\n",
20
+ " tags = []\n",
21
+ " a=titles[k].select(\".d-inline\")\n",
22
+ " \n",
23
+ " for a_block in a:\n",
24
+ " a_tags = a_block.select(\"a\")\n",
25
+ " for tag in a_tags:\n",
26
+ " tags.append(tag.get_text())\n",
27
+ " d[id_datas]=tags"
28
+ ]
29
+ },
30
+ {
31
+ "cell_type": "code",
32
+ "execution_count": 55,
33
+ "id": "227bd7d9-ba34-4a8e-8ab8-ba62a582f28e",
34
+ "metadata": {},
35
+ "outputs": [],
36
+ "source": [
37
+ "df=pd.DataFrame(data=list(d.items()), columns=[\"id\", \"tags\"])"
38
+ ]
39
+ },
40
+ {
41
+ "cell_type": "code",
42
+ "execution_count": 58,
43
+ "id": "6df8c18a-2369-4b5d-807b-903786a21675",
44
+ "metadata": {},
45
+ "outputs": [],
46
+ "source": [
47
+ "df.to_csv(r\"C:\\Users\\sss\\Desktop\\datas_tags.csv\")"
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "code",
52
+ "execution_count": null,
53
+ "id": "298e0e6d-5882-4ad8-bb7f-e206530dfda8",
54
+ "metadata": {},
55
+ "outputs": [],
56
+ "source": []
57
+ }
58
+ ],
59
+ "metadata": {
60
+ "kernelspec": {
61
+ "display_name": "Python 3 (ipykernel)",
62
+ "language": "python",
63
+ "name": "python3"
64
+ },
65
+ "language_info": {
66
+ "codemirror_mode": {
67
+ "name": "ipython",
68
+ "version": 3
69
+ },
70
+ "file_extension": ".py",
71
+ "mimetype": "text/x-python",
72
+ "name": "python",
73
+ "nbconvert_exporter": "python",
74
+ "pygments_lexer": "ipython3",
75
+ "version": "3.12.7"
76
+ }
77
+ },
78
+ "nbformat": 4,
79
+ "nbformat_minor": 5
80
+ }
titles_data.ipynb ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 478,
6
+ "id": "67ac2431-6f18-4fdc-ba79-1ca3b3c5881b",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import requests as req\n",
11
+ "from bs4 import BeautifulSoup as b\n",
12
+ "import pandas as pd\n",
13
+ "d={}\n",
14
+ "for i in range(0,110):\n",
15
+ " res=req.get(f\"https://stackoverflow.com/questions?tab=newest&page={i}\")\n",
16
+ " soup1=b(res.text,\"html.parser\")\n",
17
+ " titles=soup1.select(\".s-post-summary\")\n",
18
+ " for k in range(len(titles)):\n",
19
+ " id_data=titles[k].get(\"data-post-id\")\n",
20
+ " s=titles[k].select_one(\".s-link\").get_text()\n",
21
+ " d[id_data]=s"
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "code",
26
+ "execution_count": 480,
27
+ "id": "ab3eac5e-0ec7-4e5e-94a5-3336ec1f5aa0",
28
+ "metadata": {},
29
+ "outputs": [],
30
+ "source": [
31
+ "df=pd.DataFrame(data=list(d.items()), columns=[\"id\", \"title\"])"
32
+ ]
33
+ },
34
+ {
35
+ "cell_type": "code",
36
+ "execution_count": 486,
37
+ "id": "97358e26-d8f2-4a80-ad92-163508541585",
38
+ "metadata": {},
39
+ "outputs": [],
40
+ "source": [
41
+ "df.to_csv(r\"C:\\Users\\sss\\Desktop\\datas_titles.csv\")"
42
+ ]
43
+ },
44
+ {
45
+ "cell_type": "code",
46
+ "execution_count": null,
47
+ "id": "a332562f-707a-4392-a122-ef435457252a",
48
+ "metadata": {},
49
+ "outputs": [],
50
+ "source": []
51
+ }
52
+ ],
53
+ "metadata": {
54
+ "kernelspec": {
55
+ "display_name": "Python 3 (ipykernel)",
56
+ "language": "python",
57
+ "name": "python3"
58
+ },
59
+ "language_info": {
60
+ "codemirror_mode": {
61
+ "name": "ipython",
62
+ "version": 3
63
+ },
64
+ "file_extension": ".py",
65
+ "mimetype": "text/x-python",
66
+ "name": "python",
67
+ "nbconvert_exporter": "python",
68
+ "pygments_lexer": "ipython3",
69
+ "version": "3.12.7"
70
+ }
71
+ },
72
+ "nbformat": 4,
73
+ "nbformat_minor": 5
74
+ }