Introlix committed on
Commit
1631829
·
1 Parent(s): 5bbd4c3
.gitignore ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110
+ .pdm.toml
111
+ .pdm-python
112
+ .pdm-build/
113
+
114
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115
+ __pypackages__/
116
+
117
+ # Celery stuff
118
+ celerybeat-schedule
119
+ celerybeat.pid
120
+
121
+ # SageMath parsed files
122
+ *.sage.py
123
+
124
+ # Environments
125
+ .env
126
+ .venv
127
+ env/
128
+ venv/
129
+ ENV/
130
+ env.bak/
131
+ venv.bak/
132
+
133
+ # Spyder project settings
134
+ .spyderproject
135
+ .spyproject
136
+
137
+ # Rope project settings
138
+ .ropeproject
139
+
140
+ # mkdocs documentation
141
+ /site
142
+
143
+ # mypy
144
+ .mypy_cache/
145
+ .dmypy.json
146
+ dmypy.json
147
+
148
+ # Pyre type checker
149
+ .pyre/
150
+
151
+ # pytype static type analyzer
152
+ .pytype/
153
+
154
+ # Cython debug symbols
155
+ cython_debug/
156
+
157
+ # PyCharm
158
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
161
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162
+ #.idea/
Dockerfile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.9
2
+
3
+ RUN useradd -m -u 1000 user
4
+ USER user
5
+ ENV PATH="/home/user/.local/bin:$PATH"
6
+
7
+ WORKDIR /app
8
+
9
+ COPY --chown=user ./requirements.txt requirements.txt
10
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
11
+
12
+ COPY --chown=user . /app
13
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import joblib
2
+ from fastapi import FastAPI, HTTPException
3
+ import sys
4
+ import os
5
+ import re
6
+ import string
7
+ from nltk.stem.porter import PorterStemmer
8
+ from fastapi.responses import Response
9
+ from fastapi.templating import Jinja2Templates
10
+ from starlette.responses import RedirectResponse
11
+
12
+ from pydantic import BaseModel
13
+
14
# ASGI application object; the route decorators further down register on it.
app = FastAPI()
15
+
16
# ---------------------------------------------------------------------------
# Lookup tables for ``preprocessing``. They are constant, so they are built
# once at import time instead of on every call.
# ---------------------------------------------------------------------------

# Currency / percent symbols spelled out as words (surrounding spaces are part
# of the replacement text).
_SYMBOL_WORDS = str.maketrans({
    '%': ' percent',
    '$': ' dollar ',
    '₹': ' rupee ',
    '€': ' euro ',
})

_HTML_TAG_RE = re.compile(r'<.*?>')
_URL_RE = re.compile(r'\s*(?:https?://)?www\.\S*\.[A-Za-z]{2,5}\s*')

# Whole-token contraction expansions (keys are matched against lower-cased,
# whitespace-separated tokens).
_CONTRACTIONS = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "can not",
    "can't've": "can not have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
}

# Generic suffix expansions applied after the whole-token pass, in this order.
_CONTRACTION_SUFFIXES = (
    ("'ve", " have"),
    ("n't", " not"),
    ("'re", " are"),
    ("'ll", " will"),
)

# frozenset gives O(1) membership tests (the same words were previously kept
# in a list and scanned linearly for every token).
_STOPWORDS = frozenset([
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you",
    "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself",
    "she", "her", "hers", "herself", "it", "its", "itself", "they", "them",
    "their", "theirs", "themselves", "what", "which", "who", "whom", "this",
    "that", "these", "those", "am", "is", "are", "was", "were", "be", "been",
    "being", "have", "has", "had", "having", "do", "does", "did", "doing",
    "a", "an", "the", "and", "but", "if", "or", "because", "as", "until",
    "while", "of", "at", "by", "for", "with", "about", "against", "between",
    "into", "through", "during", "before", "after", "above", "below", "to",
    "from", "up", "down", "in", "out", "on", "off", "over", "under", "again",
    "further", "then", "once", "here", "there", "when", "where", "why", "how",
    "all", "any", "both", "each", "few", "more", "most", "other", "some",
    "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too",
    "very", "s", "t", "can", "will", "just", "don", "should", "now",
])

# One translation table that deletes ASCII punctuation and digits and turns
# the typographic apostrophe into a space.
_STRIP_CHARS = str.maketrans('’', ' ', string.punctuation + string.digits)


def preprocessing(text):
    """Normalise raw text into a space-separated string of stemmed tokens.

    Steps, in order: lower-case and trim; spell out ``% $ ₹ €``; drop HTML
    tags; drop ``www.`` URLs; expand English contractions; remove stop words;
    delete ASCII punctuation and digits (and turn ``’`` into a space);
    collapse whitespace; Porter-stem every remaining token.

    NOTE(review): the output is deliberately kept identical to the previous
    implementation -- presumably the serialized TF-IDF vectoriser was fitted
    on text cleaned by this exact pipeline, so any change to the result would
    skew predictions. Confirm before altering any step.

    Args:
        text: The raw input string.

    Returns:
        The cleaned, stemmed text; an empty string when nothing survives
        the cleaning.
    """
    text = text.lower().strip()
    text = text.translate(_SYMBOL_WORDS)
    text = _HTML_TAG_RE.sub('', text)
    text = _URL_RE.sub(' ', text).strip()

    # Whole-token contractions first, then the generic suffix fallbacks.
    text = ' '.join(_CONTRACTIONS.get(word, word) for word in text.split())
    for suffix, expansion in _CONTRACTION_SUFFIXES:
        text = text.replace(suffix, expansion)

    # Stop words are matched before punctuation is stripped, so a token such
    # as "the," is intentionally NOT treated as a stop word (same as before).
    kept = [word for word in text.split() if word not in _STOPWORDS]
    text = ' '.join(kept).translate(_STRIP_CHARS)

    words = text.split()
    if not words:
        # Nothing left to stem; skip building a stemmer.
        return ''

    stemmer = PorterStemmer()
    return ' '.join(stemmer.stem(word) for word in words)
199
+
200
# Resolve the artifact directory relative to this file so that loading does
# not depend on the working directory the server happens to be started from.
_MODEL_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "model")

# NOTE(security): joblib.load unpickles arbitrary Python objects. Only load
# artifacts that come from a trusted source.
model = joblib.load(os.path.join(_MODEL_DIR, "mnb_classifier.joblib"))
label_encoder = joblib.load(os.path.join(_MODEL_DIR, "label_encoder.joblib"))
tf_idf = joblib.load(os.path.join(_MODEL_DIR, "tfidf_vectorizer.joblib"))
203
+
204
# Root route: forwards visitors of "/" to the "/docs" page.
# (Kept as a comment rather than a docstring so the published API schema is
# unchanged.)
@app.get("/", tags=["authentication"])
async def index():
    docs_redirect = RedirectResponse(url='/docs')
    return docs_redirect
207
+
208
class TextRequest(BaseModel):
    # Request body accepted by POST /classify/.
    # text: the raw string to be classified.
    text: str
210
+
211
@app.post("/classify/")
async def classify_route(request: TextRequest):
    """Classify the submitted text and return its predicted category.

    The text is cleaned with ``preprocessing``, vectorised with ``tf_idf``,
    passed to ``model.predict``, and the prediction is mapped back to a label
    with ``label_encoder.inverse_transform``.

    Returns:
        A dict of the form ``{"category": <label>}``.

    Raises:
        HTTPException: status 500 with a generic detail message when any
            step of the pipeline raises.
    """
    import logging  # function-scope import keeps this block self-contained

    try:
        cleaned = preprocessing(request.text)
        features = tf_idf.transform([cleaned])
        prediction = model.predict(features)
        return {"category": label_encoder.inverse_transform(prediction)[0]}
    except Exception as e:
        # Record the full traceback (print(e) only showed the message) while
        # keeping the client-facing error generic.
        logging.getLogger(__name__).exception("Text classification failed")
        raise HTTPException(status_code=500, detail="Internal Server Error") from e
model/label_encoder.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9510d0a5b40441b23d5247b0af353cf1ef4a356c452ffdcf276ec96e8afed399
3
+ size 617
model/mnb_classifier.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8956be088854bc4e91b231c4553982db83b60d9a50a595a3105ba4337173c27
3
+ size 6767015
model/tfidf_vectorizer.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6328ae0014fafd053dcf3fa1b6090b7d157fb67f6ce64ff3788ebf3332beb0e8
3
+ size 55543711
notebook/basic-model-text-classification.ipynb ADDED
@@ -0,0 +1 @@
 
 
1
+ {"metadata":{"kernelspec":{"name":"python3","display_name":"Python 3","language":"python"},"language_info":{"name":"python","version":"3.10.13","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"none","dataSources":[{"sourceId":8877343,"sourceType":"datasetVersion","datasetId":5343463},{"sourceId":8892418,"sourceType":"datasetVersion","datasetId":5347872}],"dockerImageVersionId":30732,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":false}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"import re\nimport nltk\nimport string\nimport numpy as np\nimport pandas as pd\nimport matplotlib as plt\nimport seaborn as sns","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:44:14.693763Z","iopub.execute_input":"2024-08-12T08:44:14.694299Z","iopub.status.idle":"2024-08-12T08:44:19.173465Z","shell.execute_reply.started":"2024-08-12T08:44:14.694246Z","shell.execute_reply":"2024-08-12T08:44:19.171816Z"},"trusted":true},"execution_count":1,"outputs":[]},{"cell_type":"code","source":"train_df = pd.read_csv(\"/kaggle/input/abc-mn-dataset/train_data.csv\")\ntest_df = 
pd.read_csv(\"/kaggle/input/abc-mn-dataset/test_data.csv\")","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:44:19.175546Z","iopub.execute_input":"2024-08-12T08:44:19.176077Z","iopub.status.idle":"2024-08-12T08:44:20.347463Z","shell.execute_reply.started":"2024-08-12T08:44:19.176041Z","shell.execute_reply":"2024-08-12T08:44:20.346047Z"},"trusted":true},"execution_count":2,"outputs":[]},{"cell_type":"code","source":"train_df.head()","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:44:20.349126Z","iopub.execute_input":"2024-08-12T08:44:20.349504Z","iopub.status.idle":"2024-08-12T08:44:20.375925Z","shell.execute_reply.started":"2024-08-12T08:44:20.349473Z","shell.execute_reply":"2024-08-12T08:44:20.374365Z"},"trusted":true},"execution_count":3,"outputs":[{"execution_count":3,"output_type":"execute_result","data":{"text/plain":" text label\n0 A Leaking Oil on Refinery St. Croix Biden Give... nature\n1 Practical Steps To Build Transparency In Busin... coding\n2 How to Convert Image Runway into Video using Ml? ml\n3 Design: Principles Visual And Direction Weight coding\n4 California Permanent Enacts for Protections Tr... nature","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>text</th>\n <th>label</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>A Leaking Oil on Refinery St. 
Croix Biden Give...</td>\n <td>nature</td>\n </tr>\n <tr>\n <th>1</th>\n <td>Practical Steps To Build Transparency In Busin...</td>\n <td>coding</td>\n </tr>\n <tr>\n <th>2</th>\n <td>How to Convert Image Runway into Video using Ml?</td>\n <td>ml</td>\n </tr>\n <tr>\n <th>3</th>\n <td>Design: Principles Visual And Direction Weight</td>\n <td>coding</td>\n </tr>\n <tr>\n <th>4</th>\n <td>California Permanent Enacts for Protections Tr...</td>\n <td>nature</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"test_df.head()","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:44:20.379696Z","iopub.execute_input":"2024-08-12T08:44:20.380199Z","iopub.status.idle":"2024-08-12T08:44:20.392547Z","shell.execute_reply.started":"2024-08-12T08:44:20.380158Z","shell.execute_reply":"2024-08-12T08:44:20.390964Z"},"trusted":true},"execution_count":4,"outputs":[{"execution_count":4,"output_type":"execute_result","data":{"text/plain":" text label\n0 Nexen restoring Gulf Mexico production after h... business\n1 Dollar Mostly Down After Early Gain NEW YORK ... business\n2 The AI-Generated Child Abuse Nightmare Is Here AI\n3 Johnny Depp Says He's No Heartthrob LONDON - J... world\n4 Busch pulls out Cup title When his right-front... 
sports","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>text</th>\n <th>label</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>Nexen restoring Gulf Mexico production after h...</td>\n <td>business</td>\n </tr>\n <tr>\n <th>1</th>\n <td>Dollar Mostly Down After Early Gain NEW YORK ...</td>\n <td>business</td>\n </tr>\n <tr>\n <th>2</th>\n <td>The AI-Generated Child Abuse Nightmare Is Here</td>\n <td>AI</td>\n </tr>\n <tr>\n <th>3</th>\n <td>Johnny Depp Says He's No Heartthrob LONDON - J...</td>\n <td>world</td>\n </tr>\n <tr>\n <th>4</th>\n <td>Busch pulls out Cup title When his right-front...</td>\n <td>sports</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"train_df.drop_duplicates(inplace=True)","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:44:20.394389Z","iopub.execute_input":"2024-08-12T08:44:20.394883Z","iopub.status.idle":"2024-08-12T08:44:20.544157Z","shell.execute_reply.started":"2024-08-12T08:44:20.394834Z","shell.execute_reply":"2024-08-12T08:44:20.542719Z"},"trusted":true},"execution_count":5,"outputs":[]},{"cell_type":"code","source":"test_df.drop_duplicates(inplace=True)","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:44:20.546049Z","iopub.execute_input":"2024-08-12T08:44:20.547821Z","iopub.status.idle":"2024-08-12T08:44:20.600328Z","shell.execute_reply.started":"2024-08-12T08:44:20.547717Z","shell.execute_reply":"2024-08-12T08:44:20.598880Z"},"trusted":true},"execution_count":6,"outputs":[]},{"cell_type":"markdown","source":"## **1. 
EDA**","metadata":{}},{"cell_type":"code","source":"train_df.info()","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:44:20.601757Z","iopub.execute_input":"2024-08-12T08:44:20.602217Z","iopub.status.idle":"2024-08-12T08:44:20.656083Z","shell.execute_reply.started":"2024-08-12T08:44:20.602170Z","shell.execute_reply":"2024-08-12T08:44:20.654533Z"},"trusted":true},"execution_count":7,"outputs":[{"name":"stdout","text":"<class 'pandas.core.frame.DataFrame'>\nIndex: 120097 entries, 0 to 120289\nData columns (total 2 columns):\n # Column Non-Null Count Dtype \n--- ------ -------------- ----- \n 0 text 120097 non-null object\n 1 label 120097 non-null object\ndtypes: object(2)\nmemory usage: 2.7+ MB\n","output_type":"stream"}]},{"cell_type":"code","source":"test_df.info()","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:44:20.657882Z","iopub.execute_input":"2024-08-12T08:44:20.658368Z","iopub.status.idle":"2024-08-12T08:44:20.684731Z","shell.execute_reply.started":"2024-08-12T08:44:20.658324Z","shell.execute_reply":"2024-08-12T08:44:20.683096Z"},"trusted":true},"execution_count":8,"outputs":[{"name":"stdout","text":"<class 'pandas.core.frame.DataFrame'>\nRangeIndex: 44893 entries, 0 to 44892\nData columns (total 2 columns):\n # Column Non-Null Count Dtype \n--- ------ -------------- ----- \n 0 text 44893 non-null object\n 1 label 44893 non-null object\ndtypes: object(2)\nmemory usage: 701.6+ KB\n","output_type":"stream"}]},{"cell_type":"code","source":"train_df['label'].value_counts()","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:44:20.686777Z","iopub.execute_input":"2024-08-12T08:44:20.687203Z","iopub.status.idle":"2024-08-12T08:44:20.721891Z","shell.execute_reply.started":"2024-08-12T08:44:20.687161Z","shell.execute_reply":"2024-08-12T08:44:20.720399Z"},"trusted":true},"execution_count":9,"outputs":[{"execution_count":9,"output_type":"execute_result","data":{"text/plain":"label\nsports 22339\nsci/tech 22336\nbusiness 22299\nworld 
22229\ncoding 9193\nnature 7683\nAI 7260\nml 6758\nName: count, dtype: int64"},"metadata":{}}]},{"cell_type":"code","source":"# finding if it contains html tags\ndef contains_html_tags_regex(text):\n html_tag_pattern = re.compile(r'<[^>]+>')\n if bool(html_tag_pattern.search(text)) == True:\n print(\"HTML Found!!\")\n\ntrain_df['text'].apply(contains_html_tags_regex).sum()","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:44:20.727199Z","iopub.execute_input":"2024-08-12T08:44:20.727635Z","iopub.status.idle":"2024-08-12T08:44:21.007547Z","shell.execute_reply.started":"2024-08-12T08:44:20.727594Z","shell.execute_reply":"2024-08-12T08:44:21.006242Z"},"trusted":true},"execution_count":10,"outputs":[{"name":"stdout","text":"HTML Found!!\nHTML Found!!\nHTML Found!!\nHTML Found!!\nHTML Found!!\nHTML Found!!\nHTML Found!!\nHTML Found!!\n","output_type":"stream"},{"execution_count":10,"output_type":"execute_result","data":{"text/plain":"0"},"metadata":{}}]},{"cell_type":"code","source":"# finding if it contains emails\ndef contains_emails(text):\n email_pattern = re.compile(r'\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Z|a-z]{2,7}\\b')\n if bool(email_pattern.search(text)) == True:\n print(\"URL Found!!\")\n\ntrain_df['text'].apply(contains_emails).sum()","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:44:21.009343Z","iopub.execute_input":"2024-08-12T08:44:21.009983Z","iopub.status.idle":"2024-08-12T08:44:22.364059Z","shell.execute_reply.started":"2024-08-12T08:44:21.009932Z","shell.execute_reply":"2024-08-12T08:44:22.362689Z"},"trusted":true},"execution_count":11,"outputs":[{"execution_count":11,"output_type":"execute_result","data":{"text/plain":"0"},"metadata":{}}]},{"cell_type":"markdown","source":"## **2. 
Preprocessing**","metadata":{}},{"cell_type":"code","source":"from nltk.stem.porter import PorterStemmer\n\ndef preprocessing(text):\n text = text.lower().strip()\n\n # Replace certain special characters with their string equivalents\n text = text.replace('%', ' percent')\n text = text.replace('$', ' dollar ')\n text = text.replace('₹', ' rupee ')\n text = text.replace('€', ' euro ')\n\n # remove html tags\n html_tag_pattern = re.compile(r'<.*?>')\n text = html_tag_pattern.sub('', text)\n\n # remove urls\n text = re.sub(r'\\s*(?:https?://)?www\\.\\S*\\.[A-Za-z]{2,5}\\s*', ' ', text).strip()\n\n # Decontracting words\n contractions = { \n \"ain't\": \"am not\",\n \"aren't\": \"are not\",\n \"can't\": \"can not\",\n \"can't've\": \"can not have\",\n \"'cause\": \"because\",\n \"could've\": \"could have\",\n \"couldn't\": \"could not\",\n \"couldn't've\": \"could not have\",\n \"didn't\": \"did not\",\n \"doesn't\": \"does not\",\n \"don't\": \"do not\",\n \"hadn't\": \"had not\",\n \"hadn't've\": \"had not have\",\n \"hasn't\": \"has not\",\n \"haven't\": \"have not\",\n \"he'd\": \"he would\",\n \"he'd've\": \"he would have\",\n \"he'll\": \"he will\",\n \"he'll've\": \"he will have\",\n \"he's\": \"he is\",\n \"how'd\": \"how did\",\n \"how'd'y\": \"how do you\",\n \"how'll\": \"how will\",\n \"how's\": \"how is\",\n \"i'd\": \"i would\",\n \"i'd've\": \"i would have\",\n \"i'll\": \"i will\",\n \"i'll've\": \"i will have\",\n \"i'm\": \"i am\",\n \"i've\": \"i have\",\n \"isn't\": \"is not\",\n \"it'd\": \"it would\",\n \"it'd've\": \"it would have\",\n \"it'll\": \"it will\",\n \"it'll've\": \"it will have\",\n \"it's\": \"it is\",\n \"let's\": \"let us\",\n \"ma'am\": \"madam\",\n \"mayn't\": \"may not\",\n \"might've\": \"might have\",\n \"mightn't\": \"might not\",\n \"mightn't've\": \"might not have\",\n \"must've\": \"must have\",\n \"mustn't\": \"must not\",\n \"mustn't've\": \"must not have\",\n \"needn't\": \"need not\",\n \"needn't've\": \"need not 
have\",\n \"o'clock\": \"of the clock\",\n \"oughtn't\": \"ought not\",\n \"oughtn't've\": \"ought not have\",\n \"shan't\": \"shall not\",\n \"sha'n't\": \"shall not\",\n \"shan't've\": \"shall not have\",\n \"she'd\": \"she would\",\n \"she'd've\": \"she would have\",\n \"she'll\": \"she will\",\n \"she'll've\": \"she will have\",\n \"she's\": \"she is\",\n \"should've\": \"should have\",\n \"shouldn't\": \"should not\",\n \"shouldn't've\": \"should not have\",\n \"so've\": \"so have\",\n \"so's\": \"so as\",\n \"that'd\": \"that would\",\n \"that'd've\": \"that would have\",\n \"that's\": \"that is\",\n \"there'd\": \"there would\",\n \"there'd've\": \"there would have\",\n \"there's\": \"there is\",\n \"they'd\": \"they would\",\n \"they'd've\": \"they would have\",\n \"they'll\": \"they will\",\n \"they'll've\": \"they will have\",\n \"they're\": \"they are\",\n \"they've\": \"they have\",\n \"to've\": \"to have\",\n \"wasn't\": \"was not\",\n \"we'd\": \"we would\",\n \"we'd've\": \"we would have\",\n \"we'll\": \"we will\",\n \"we'll've\": \"we will have\",\n \"we're\": \"we are\",\n \"we've\": \"we have\",\n \"weren't\": \"were not\",\n \"what'll\": \"what will\",\n \"what'll've\": \"what will have\",\n \"what're\": \"what are\",\n \"what's\": \"what is\",\n \"what've\": \"what have\",\n \"when's\": \"when is\",\n \"when've\": \"when have\",\n \"where'd\": \"where did\",\n \"where's\": \"where is\",\n \"where've\": \"where have\",\n \"who'll\": \"who will\",\n \"who'll've\": \"who will have\",\n \"who's\": \"who is\",\n \"who've\": \"who have\",\n \"why's\": \"why is\",\n \"why've\": \"why have\",\n \"will've\": \"will have\",\n \"won't\": \"will not\",\n \"won't've\": \"will not have\",\n \"would've\": \"would have\",\n \"wouldn't\": \"would not\",\n \"wouldn't've\": \"would not have\",\n \"y'all\": \"you all\",\n \"y'all'd\": \"you all would\",\n \"y'all'd've\": \"you all would have\",\n \"y'all're\": \"you all are\",\n \"y'all've\": \"you all have\",\n 
\"you'd\": \"you would\",\n \"you'd've\": \"you would have\",\n \"you'll\": \"you will\",\n \"you'll've\": \"you will have\",\n \"you're\": \"you are\",\n \"you've\": \"you have\"\n }\n\n q_decontracted = []\n\n for word in text.split():\n if word in contractions:\n word = contractions[word]\n\n q_decontracted.append(word)\n\n text = ' '.join(q_decontracted)\n text = text.replace(\"'ve\", \" have\")\n text = text.replace(\"n't\", \" not\")\n text = text.replace(\"'re\", \" are\")\n text = text.replace(\"'ll\", \" will\")\n\n # remove stop words\n new_text = []\n stopwords = [\"i\", \"me\", \"my\", \"myself\", \"we\", \"our\", \"ours\", \"ourselves\", \"you\", \"your\", \"yours\", \"yourself\", \"yourselves\", \"he\", \"him\", \"his\", \"himself\", \"she\", \"her\", \"hers\", \"herself\", \"it\", \"its\", \"itself\", \"they\", \"them\", \"their\", \"theirs\", \"themselves\", \"what\", \"which\", \"who\", \"whom\", \"this\", \"that\", \"these\", \"those\", \"am\", \"is\", \"are\", \"was\", \"were\", \"be\", \"been\", \"being\", \"have\", \"has\", \"had\", \"having\", \"do\", \"does\", \"did\", \"doing\", \"a\", \"an\", \"the\", \"and\", \"but\", \"if\", \"or\", \"because\", \"as\", \"until\", \"while\", \"of\", \"at\", \"by\", \"for\", \"with\", \"about\", \"against\", \"between\", \"into\", \"through\", \"during\", \"before\", \"after\", \"above\", \"below\", \"to\", \"from\", \"up\", \"down\", \"in\", \"out\", \"on\", \"off\", \"over\", \"under\", \"again\", \"further\", \"then\", \"once\", \"here\", \"there\", \"when\", \"where\", \"why\", \"how\", \"all\", \"any\", \"both\", \"each\", \"few\", \"more\", \"most\", \"other\", \"some\", \"such\", \"no\", \"nor\", \"not\", \"only\", \"own\", \"same\", \"so\", \"than\", \"too\", \"very\", \"s\", \"t\", \"can\", \"will\", \"just\", \"don\", \"should\", \"now\"]\n for word in text.split():\n if word in stopwords:\n new_text.append('')\n else:\n new_text.append(word)\n x = new_text[:]\n new_text.clear\n text = \" 
\".join(x)\n\n # remove punctuation\n punct = string.punctuation\n\n text = text.translate(str.maketrans('', '', punct))\n \n # remove numbers\n digits = string.digits\n text = text.translate(str.maketrans('', '', digits))\n \n # removing some characters\n text = text.replace('’', ' ')\n\n text = ' '.join(text.split())\n \n # stemming\n ps = PorterStemmer()\n \n text = \" \".join([ps.stem(word) for word in text.split()])\n \n return text","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:44:22.365805Z","iopub.execute_input":"2024-08-12T08:44:22.366302Z","iopub.status.idle":"2024-08-12T08:44:22.401350Z","shell.execute_reply.started":"2024-08-12T08:44:22.366255Z","shell.execute_reply":"2024-08-12T08:44:22.399987Z"},"trusted":true},"execution_count":12,"outputs":[]},{"cell_type":"code","source":"preprocessing(\"’ s lightmatter photonic ambitions light AI up an $ 80M B round\")","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:44:22.402898Z","iopub.execute_input":"2024-08-12T08:44:22.403367Z","iopub.status.idle":"2024-08-12T08:44:22.425765Z","shell.execute_reply.started":"2024-08-12T08:44:22.403329Z","shell.execute_reply":"2024-08-12T08:44:22.424455Z"},"trusted":true},"execution_count":13,"outputs":[{"execution_count":13,"output_type":"execute_result","data":{"text/plain":"'lightmatt photon ambit light ai dollar m b round'"},"metadata":{}}]},{"cell_type":"code","source":"train_df['text'] = train_df['text'].apply(preprocessing)\ntest_df['text'] = 
test_df['text'].apply(preprocessing)","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:44:22.427231Z","iopub.execute_input":"2024-08-12T08:44:22.427635Z","iopub.status.idle":"2024-08-12T08:47:08.687108Z","shell.execute_reply.started":"2024-08-12T08:44:22.427591Z","shell.execute_reply":"2024-08-12T08:47:08.686113Z"},"trusted":true},"execution_count":14,"outputs":[]},{"cell_type":"code","source":"print(train_df.shape)\nprint(test_df.shape)","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:47:08.688493Z","iopub.execute_input":"2024-08-12T08:47:08.688873Z","iopub.status.idle":"2024-08-12T08:47:08.695313Z","shell.execute_reply.started":"2024-08-12T08:47:08.688840Z","shell.execute_reply":"2024-08-12T08:47:08.693855Z"},"trusted":true},"execution_count":15,"outputs":[{"name":"stdout","text":"(120097, 2)\n(44893, 2)\n","output_type":"stream"}]},{"cell_type":"code","source":"train_df.head()","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:47:08.696761Z","iopub.execute_input":"2024-08-12T08:47:08.697338Z","iopub.status.idle":"2024-08-12T08:47:08.719350Z","shell.execute_reply.started":"2024-08-12T08:47:08.697291Z","shell.execute_reply":"2024-08-12T08:47:08.717950Z"},"trusted":true},"execution_count":16,"outputs":[{"execution_count":16,"output_type":"execute_result","data":{"text/plain":" text label\n0 leak oil refineri st croix biden give environm... 
nature\n1 practic step build transpar busi remot coding\n2 convert imag runway video use ml ml\n3 design principl visual direct weight coding\n4 california perman enact protect tree joshua nature","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>text</th>\n <th>label</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>leak oil refineri st croix biden give environm...</td>\n <td>nature</td>\n </tr>\n <tr>\n <th>1</th>\n <td>practic step build transpar busi remot</td>\n <td>coding</td>\n </tr>\n <tr>\n <th>2</th>\n <td>convert imag runway video use ml</td>\n <td>ml</td>\n </tr>\n <tr>\n <th>3</th>\n <td>design principl visual direct weight</td>\n <td>coding</td>\n </tr>\n <tr>\n <th>4</th>\n <td>california perman enact protect tree joshua</td>\n <td>nature</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"train_df.dropna(inplace=True)\ntest_df.dropna(inplace=True)\ntrain_df.shape","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:47:08.720814Z","iopub.execute_input":"2024-08-12T08:47:08.721233Z","iopub.status.idle":"2024-08-12T08:47:08.795264Z","shell.execute_reply.started":"2024-08-12T08:47:08.721200Z","shell.execute_reply":"2024-08-12T08:47:08.794181Z"},"trusted":true},"execution_count":17,"outputs":[{"execution_count":17,"output_type":"execute_result","data":{"text/plain":"(120097, 2)"},"metadata":{}}]},{"cell_type":"markdown","source":"## **3. Preparing Dataset For Training**","metadata":{}},{"cell_type":"markdown","source":"### **3.1. 
Extracting Features From The Dataset**","metadata":{}},{"cell_type":"code","source":"from sklearn.feature_extraction.text import TfidfVectorizer\n\ntfidf = TfidfVectorizer(min_df=8, ngram_range=(1, 3))","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:47:08.796819Z","iopub.execute_input":"2024-08-12T08:47:08.797729Z","iopub.status.idle":"2024-08-12T08:47:08.803184Z","shell.execute_reply.started":"2024-08-12T08:47:08.797687Z","shell.execute_reply":"2024-08-12T08:47:08.801863Z"},"trusted":true},"execution_count":18,"outputs":[]},{"cell_type":"code","source":"# using tfidf to extract features from the dataset\ntrain_text_vector = tfidf.fit_transform(train_df['text']).toarray()\ntest_text_vector = tfidf.transform(test_df['text']).toarray()","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:47:08.804636Z","iopub.execute_input":"2024-08-12T08:47:08.805074Z","iopub.status.idle":"2024-08-12T08:47:45.635014Z","shell.execute_reply.started":"2024-08-12T08:47:08.805043Z","shell.execute_reply":"2024-08-12T08:47:45.633422Z"},"trusted":true},"execution_count":19,"outputs":[]},{"cell_type":"code","source":"train_text_vector","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:47:45.636630Z","iopub.execute_input":"2024-08-12T08:47:45.637207Z","iopub.status.idle":"2024-08-12T08:47:45.647057Z","shell.execute_reply.started":"2024-08-12T08:47:45.637162Z","shell.execute_reply":"2024-08-12T08:47:45.645459Z"},"trusted":true},"execution_count":20,"outputs":[{"execution_count":20,"output_type":"execute_result","data":{"text/plain":"array([[0., 0., 0., ..., 0., 0., 0.],\n [0., 0., 0., ..., 0., 0., 0.],\n [0., 0., 0., ..., 0., 0., 0.],\n ...,\n [0., 0., 0., ..., 0., 0., 0.],\n [0., 0., 0., ..., 0., 0., 0.],\n [0., 0., 0., ..., 0., 0., 0.]])"},"metadata":{}}]},{"cell_type":"code","source":"# converting the data array into dataframe\ntrain_text_vector_df = pd.DataFrame(train_text_vector, index=train_df.index)\ntest_text_vector_df = pd.DataFrame(test_text_vector, 
index=test_df.index)","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:47:45.648219Z","iopub.execute_input":"2024-08-12T08:47:45.648646Z","iopub.status.idle":"2024-08-12T08:47:45.670509Z","shell.execute_reply.started":"2024-08-12T08:47:45.648611Z","shell.execute_reply":"2024-08-12T08:47:45.669110Z"},"trusted":true},"execution_count":21,"outputs":[]},{"cell_type":"code","source":"X_train = train_text_vector_df\ny_train = train_df['label']","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:47:45.672440Z","iopub.execute_input":"2024-08-12T08:47:45.672947Z","iopub.status.idle":"2024-08-12T08:47:45.688735Z","shell.execute_reply.started":"2024-08-12T08:47:45.672911Z","shell.execute_reply":"2024-08-12T08:47:45.687204Z"},"trusted":true},"execution_count":22,"outputs":[]},{"cell_type":"code","source":"train_text_vector_df.shape","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:47:45.690102Z","iopub.execute_input":"2024-08-12T08:47:45.690587Z","iopub.status.idle":"2024-08-12T08:47:45.708181Z","shell.execute_reply.started":"2024-08-12T08:47:45.690538Z","shell.execute_reply":"2024-08-12T08:47:45.706811Z"},"trusted":true},"execution_count":23,"outputs":[{"execution_count":23,"output_type":"execute_result","data":{"text/plain":"(120097, 52860)"},"metadata":{}}]},{"cell_type":"code","source":"test_text_vector_df.shape","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:47:45.709713Z","iopub.execute_input":"2024-08-12T08:47:45.710249Z","iopub.status.idle":"2024-08-12T08:47:45.726817Z","shell.execute_reply.started":"2024-08-12T08:47:45.710198Z","shell.execute_reply":"2024-08-12T08:47:45.724408Z"},"trusted":true},"execution_count":24,"outputs":[{"execution_count":24,"output_type":"execute_result","data":{"text/plain":"(44893, 52860)"},"metadata":{}}]},{"cell_type":"code","source":"X_test = test_text_vector_df\ny_test = 
test_df['label']","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:47:45.729221Z","iopub.execute_input":"2024-08-12T08:47:45.729901Z","iopub.status.idle":"2024-08-12T08:47:45.741643Z","shell.execute_reply.started":"2024-08-12T08:47:45.729863Z","shell.execute_reply":"2024-08-12T08:47:45.740067Z"},"trusted":true},"execution_count":25,"outputs":[]},{"cell_type":"code","source":"X_train.head()","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:47:45.743355Z","iopub.execute_input":"2024-08-12T08:47:45.743749Z","iopub.status.idle":"2024-08-12T08:47:45.798489Z","shell.execute_reply.started":"2024-08-12T08:47:45.743715Z","shell.execute_reply":"2024-08-12T08:47:45.796890Z"},"trusted":true},"execution_count":26,"outputs":[{"execution_count":26,"output_type":"execute_result","data":{"text/plain":" 0 1 2 3 4 5 6 7 8 9 ... \\\n0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... \n1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... \n2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... \n3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... \n4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 
\n\n 52850 52851 52852 52853 52854 52855 52856 52857 52858 52859 \n0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n\n[5 rows x 52860 columns]","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>0</th>\n <th>1</th>\n <th>2</th>\n <th>3</th>\n <th>4</th>\n <th>5</th>\n <th>6</th>\n <th>7</th>\n <th>8</th>\n <th>9</th>\n <th>...</th>\n <th>52850</th>\n <th>52851</th>\n <th>52852</th>\n <th>52853</th>\n <th>52854</th>\n <th>52855</th>\n <th>52856</th>\n <th>52857</th>\n <th>52858</th>\n <th>52859</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>2</th>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n 
<td>0.0</td>\n </tr>\n <tr>\n <th>3</th>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n </tbody>\n</table>\n<p>5 rows × 52860 columns</p>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"y_train.unique()","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:47:45.800281Z","iopub.execute_input":"2024-08-12T08:47:45.800776Z","iopub.status.idle":"2024-08-12T08:47:45.821025Z","shell.execute_reply.started":"2024-08-12T08:47:45.800731Z","shell.execute_reply":"2024-08-12T08:47:45.819503Z"},"trusted":true},"execution_count":27,"outputs":[{"execution_count":27,"output_type":"execute_result","data":{"text/plain":"array(['nature', 'coding', 'ml', 'AI', 'business', 'world', 'sports',\n 'sci/tech'], dtype=object)"},"metadata":{}}]},{"cell_type":"markdown","source":"### **3.2. 
Encoding Labels**","metadata":{}},{"cell_type":"code","source":"from sklearn.preprocessing import LabelEncoder\n\n# Initialize the encoder\nlabel_encoder = LabelEncoder()\n\n# Fit and transform the labels\ny_train = label_encoder.fit_transform(y_train)\ny_test = label_encoder.fit_transform(y_test)","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:47:45.827700Z","iopub.execute_input":"2024-08-12T08:47:45.828140Z","iopub.status.idle":"2024-08-12T08:47:45.882509Z","shell.execute_reply.started":"2024-08-12T08:47:45.828105Z","shell.execute_reply":"2024-08-12T08:47:45.881298Z"},"trusted":true},"execution_count":28,"outputs":[]},{"cell_type":"code","source":"y_train","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:47:45.884175Z","iopub.execute_input":"2024-08-12T08:47:45.884634Z","iopub.status.idle":"2024-08-12T08:47:45.892209Z","shell.execute_reply.started":"2024-08-12T08:47:45.884580Z","shell.execute_reply":"2024-08-12T08:47:45.891081Z"},"trusted":true},"execution_count":29,"outputs":[{"execution_count":29,"output_type":"execute_result","data":{"text/plain":"array([4, 2, 3, ..., 6, 5, 7])"},"metadata":{}}]},{"cell_type":"markdown","source":"## **4. Model Training**","metadata":{}},{"cell_type":"markdown","source":"### **4.1. 
Naive Bayes**","metadata":{}},{"cell_type":"code","source":"from sklearn.naive_bayes import MultinomialNB, BernoulliNB","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:47:45.893669Z","iopub.execute_input":"2024-08-12T08:47:45.894086Z","iopub.status.idle":"2024-08-12T08:47:45.912017Z","shell.execute_reply.started":"2024-08-12T08:47:45.894027Z","shell.execute_reply":"2024-08-12T08:47:45.910827Z"},"trusted":true},"execution_count":30,"outputs":[]},{"cell_type":"code","source":"mnb_classifier = MultinomialNB()\nmnb_classifier.fit(X_train, y_train)","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:47:45.914105Z","iopub.execute_input":"2024-08-12T08:47:45.914605Z","iopub.status.idle":"2024-08-12T08:48:13.583967Z","shell.execute_reply.started":"2024-08-12T08:47:45.914559Z","shell.execute_reply":"2024-08-12T08:48:13.582439Z"},"trusted":true},"execution_count":31,"outputs":[{"execution_count":31,"output_type":"execute_result","data":{"text/plain":"MultinomialNB()","text/html":"<style>#sk-container-id-1 {color: black;background-color: white;}#sk-container-id-1 pre{padding: 0;}#sk-container-id-1 div.sk-toggleable {background-color: white;}#sk-container-id-1 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-1 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-1 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-1 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-1 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 
200px;max-width: 100%;overflow: auto;}#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-1 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-1 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-1 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-1 div.sk-label:hover label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-1 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-1 div.sk-item {position: relative;z-index: 1;}#sk-container-id-1 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-1 div.sk-item::before, #sk-container-id-1 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-1 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-1 
div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-1 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-1 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-1 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-1 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-1 div.sk-label-container {text-align: center;}#sk-container-id-1 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-1 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>MultinomialNB()</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" checked><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">MultinomialNB</label><div class=\"sk-toggleable__content\"><pre>MultinomialNB()</pre></div></div></div></div></div>"},"metadata":{}}]},{"cell_type":"markdown","source":"## **5. 
Evaluate the models**","metadata":{}},{"cell_type":"code","source":"from sklearn.metrics import accuracy_score, classification_report\nmnb_predictions = mnb_classifier.predict(X_test)\nmnb_accuracy = accuracy_score(y_test, mnb_predictions)\nprint(\"Multinomial Naïve Bayes Accuracy:\", mnb_accuracy)\nprint(\"Classification Report:\")\nprint(classification_report(y_test, mnb_predictions))","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:48:13.586179Z","iopub.execute_input":"2024-08-12T08:48:13.594164Z","iopub.status.idle":"2024-08-12T08:48:23.832059Z","shell.execute_reply.started":"2024-08-12T08:48:13.594092Z","shell.execute_reply":"2024-08-12T08:48:23.830782Z"},"trusted":true},"execution_count":32,"outputs":[{"name":"stdout","text":"Multinomial Naïve Bayes Accuracy: 0.8872430000222752\nClassification Report:\n precision recall f1-score support\n\n 0 0.94 0.74 0.83 1567\n 1 0.87 0.87 0.87 9601\n 2 0.93 0.77 0.84 1932\n 3 0.92 0.87 0.90 1404\n 4 0.90 0.70 0.78 1593\n 5 0.81 0.88 0.85 9564\n 6 0.93 0.98 0.96 9561\n 7 0.92 0.90 0.91 9671\n\n accuracy 0.89 44893\n macro avg 0.90 0.84 0.87 44893\nweighted avg 0.89 0.89 0.89 44893\n\n","output_type":"stream"}]},{"cell_type":"code","source":"# testing the best model\ntext = \"US jobs growth in June beats expectations\"\n\nprepro_text = preprocessing(text)\nvec_text = tfidf.transform([prepro_text])\nresult = mnb_classifier.predict(vec_text)\nprint(label_encoder.inverse_transform(result))","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:48:23.833408Z","iopub.execute_input":"2024-08-12T08:48:23.833758Z","iopub.status.idle":"2024-08-12T08:48:23.845561Z","shell.execute_reply.started":"2024-08-12T08:48:23.833725Z","shell.execute_reply":"2024-08-12T08:48:23.844142Z"},"trusted":true},"execution_count":33,"outputs":[{"name":"stdout","text":"['business']\n","output_type":"stream"}]},{"cell_type":"code","source":"# saving the best model\nimport joblib\n\njoblib.dump(mnb_classifier, 
'mnb_classifier.joblib')\njoblib.dump(tfidf, 'tfidf_vectorizer.joblib')\njoblib.dump(label_encoder, 'label_encoder.joblib')","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:48:23.846861Z","iopub.execute_input":"2024-08-12T08:48:23.847303Z","iopub.status.idle":"2024-08-12T08:48:46.927900Z","shell.execute_reply.started":"2024-08-12T08:48:23.847270Z","shell.execute_reply":"2024-08-12T08:48:46.926786Z"},"trusted":true},"execution_count":34,"outputs":[{"execution_count":34,"output_type":"execute_result","data":{"text/plain":"['label_encoder.joblib']"},"metadata":{}}]},{"cell_type":"markdown","source":"### **4.2. Random Forest**","metadata":{"execution":{"iopub.status.busy":"2024-07-06T08:14:01.195834Z","iopub.execute_input":"2024-07-06T08:14:01.196660Z","iopub.status.idle":"2024-07-06T08:14:01.201176Z","shell.execute_reply.started":"2024-07-06T08:14:01.196624Z","shell.execute_reply":"2024-07-06T08:14:01.199962Z"}}},{"cell_type":"code","source":"from sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score\nrf = RandomForestClassifier(n_estimators=100, max_depth=None, min_samples_split=2, random_state=42)\nrf.fit(X_train, y_train)\ny_pred = rf.predict(X_test)\naccuracy_score(y_test,y_pred)","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:48:46.929334Z","iopub.execute_input":"2024-08-12T08:48:46.929685Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"from xgboost import XGBClassifier\n\nxgb_model = XGBClassifier().fit(X_train, y_train)\n\n# predict\nxgb_y_predict = xgb_model.predict(X_test)\n\n# accuracy score\nxgb_score = accuracy_score(xgb_y_predict, y_test)\n\nprint('Accuracy score is:', 
xgb_score)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"","metadata":{},"execution_count":null,"outputs":[]}]}
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ numpy
2
+ pandas
3
+ matplotlib
4
+ seaborn
5
+ scikit-learn
6
+ nltk
7
+ fastapi
8
+ uvicorn[standard]
setup.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from setuptools import find_packages, setup
2
+ from typing import List
3
+
4
def get_requirements(filepath: str) -> List[str]:
    """Return the requirement specifiers listed in the file at *filepath*.

    Each line of the file is treated as one requirement. Surrounding
    whitespace (including the trailing newline) is stripped, blank lines
    are skipped, and the ``-e .`` entry is dropped so that it is never
    passed on to the caller as a requirement.

    Args:
        filepath: Path to a requirements file with one entry per line.

    Returns:
        The cleaned requirement strings, in file order.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(filepath, encoding="utf-8") as f:
        # Strip every line *before* filtering: comparing against '-e .'
        # only works once the trailing newline has been removed.
        requirements = [line.strip() for line in f]

    return [req for req in requirements if req and req != '-e .']
19
+
20
# Resolve the computed arguments up front (package discovery first, then the
# dependency list read from requirements.txt), then register the distribution.
_discovered_packages = find_packages()
_runtime_requirements = get_requirements('requirements.txt')

setup(
    name="FeedClassify",
    version="0.0.1",
    author="Introlix",
    packages=_discovered_packages,
    install_requires=_runtime_requirements,
)