File size: 42,820 Bytes
1631829
1
{"metadata":{"kernelspec":{"name":"python3","display_name":"Python 3","language":"python"},"language_info":{"name":"python","version":"3.10.13","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"none","dataSources":[{"sourceId":8877343,"sourceType":"datasetVersion","datasetId":5343463},{"sourceId":8892418,"sourceType":"datasetVersion","datasetId":5347872}],"dockerImageVersionId":30732,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":false}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"import re\nimport nltk\nimport string\nimport numpy as np\nimport pandas as pd\nimport matplotlib as plt\nimport seaborn as sns","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:44:14.693763Z","iopub.execute_input":"2024-08-12T08:44:14.694299Z","iopub.status.idle":"2024-08-12T08:44:19.173465Z","shell.execute_reply.started":"2024-08-12T08:44:14.694246Z","shell.execute_reply":"2024-08-12T08:44:19.171816Z"},"trusted":true},"execution_count":1,"outputs":[]},{"cell_type":"code","source":"train_df = pd.read_csv(\"/kaggle/input/abc-mn-dataset/train_data.csv\")\ntest_df = pd.read_csv(\"/kaggle/input/abc-mn-dataset/test_data.csv\")","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:44:19.175546Z","iopub.execute_input":"2024-08-12T08:44:19.176077Z","iopub.status.idle":"2024-08-12T08:44:20.347463Z","shell.execute_reply.started":"2024-08-12T08:44:19.176041Z","shell.execute_reply":"2024-08-12T08:44:20.346047Z"},"trusted":true},"execution_count":2,"outputs":[]},{"cell_type":"code","source":"train_df.head()","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:44:20.349126Z","iopub.execute_input":"2024-08-12T08:44:20.349504Z","iopub.status.idle":"2024-08-12T08:44:20.375925Z","shell.execute_reply.started":"2024-08-12T08:44:20.349473Z","shell.execute_reply":"2024-08-12T08:44:20.374365Z"},"trusted":true},"execution_count":3,"outputs":[{"execution_count":3,"output_type":"execute_result","data":{"text/plain":"                                                text   label\n0  A Leaking Oil on Refinery St. Croix Biden Give...  nature\n1  Practical Steps To Build Transparency In Busin...  coding\n2   How to Convert Image Runway into Video using Ml?      ml\n3     Design: Principles Visual And Direction Weight  coding\n4  California Permanent Enacts for Protections Tr...  nature","text/html":"<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>text</th>\n      <th>label</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>A Leaking Oil on Refinery St. Croix Biden Give...</td>\n      <td>nature</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Practical Steps To Build Transparency In Busin...</td>\n      <td>coding</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>How to Convert Image Runway into Video using Ml?</td>\n      <td>ml</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>Design: Principles Visual And Direction Weight</td>\n      <td>coding</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>California Permanent Enacts for Protections Tr...</td>\n      <td>nature</td>\n    </tr>\n  </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"test_df.head()","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:44:20.379696Z","iopub.execute_input":"2024-08-12T08:44:20.380199Z","iopub.status.idle":"2024-08-12T08:44:20.392547Z","shell.execute_reply.started":"2024-08-12T08:44:20.380158Z","shell.execute_reply":"2024-08-12T08:44:20.390964Z"},"trusted":true},"execution_count":4,"outputs":[{"execution_count":4,"output_type":"execute_result","data":{"text/plain":"                                                text     label\n0  Nexen restoring Gulf Mexico production after h...  business\n1  Dollar Mostly Down After Early Gain  NEW YORK ...  business\n2     The AI-Generated Child Abuse Nightmare Is Here        AI\n3  Johnny Depp Says He's No Heartthrob LONDON - J...     world\n4  Busch pulls out Cup title When his right-front...    sports","text/html":"<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>text</th>\n      <th>label</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>Nexen restoring Gulf Mexico production after h...</td>\n      <td>business</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Dollar Mostly Down After Early Gain  NEW YORK ...</td>\n      <td>business</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>The AI-Generated Child Abuse Nightmare Is Here</td>\n      <td>AI</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>Johnny Depp Says He's No Heartthrob LONDON - J...</td>\n      <td>world</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>Busch pulls out Cup title When his right-front...</td>\n      <td>sports</td>\n    </tr>\n  </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"train_df.drop_duplicates(inplace=True)","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:44:20.394389Z","iopub.execute_input":"2024-08-12T08:44:20.394883Z","iopub.status.idle":"2024-08-12T08:44:20.544157Z","shell.execute_reply.started":"2024-08-12T08:44:20.394834Z","shell.execute_reply":"2024-08-12T08:44:20.542719Z"},"trusted":true},"execution_count":5,"outputs":[]},{"cell_type":"code","source":"test_df.drop_duplicates(inplace=True)","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:44:20.546049Z","iopub.execute_input":"2024-08-12T08:44:20.547821Z","iopub.status.idle":"2024-08-12T08:44:20.600328Z","shell.execute_reply.started":"2024-08-12T08:44:20.547717Z","shell.execute_reply":"2024-08-12T08:44:20.598880Z"},"trusted":true},"execution_count":6,"outputs":[]},{"cell_type":"markdown","source":"## **1. EDA**","metadata":{}},{"cell_type":"code","source":"train_df.info()","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:44:20.601757Z","iopub.execute_input":"2024-08-12T08:44:20.602217Z","iopub.status.idle":"2024-08-12T08:44:20.656083Z","shell.execute_reply.started":"2024-08-12T08:44:20.602170Z","shell.execute_reply":"2024-08-12T08:44:20.654533Z"},"trusted":true},"execution_count":7,"outputs":[{"name":"stdout","text":"<class 'pandas.core.frame.DataFrame'>\nIndex: 120097 entries, 0 to 120289\nData columns (total 2 columns):\n #   Column  Non-Null Count   Dtype \n---  ------  --------------   ----- \n 0   text    120097 non-null  object\n 1   label   120097 non-null  object\ndtypes: object(2)\nmemory usage: 2.7+ MB\n","output_type":"stream"}]},{"cell_type":"code","source":"test_df.info()","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:44:20.657882Z","iopub.execute_input":"2024-08-12T08:44:20.658368Z","iopub.status.idle":"2024-08-12T08:44:20.684731Z","shell.execute_reply.started":"2024-08-12T08:44:20.658324Z","shell.execute_reply":"2024-08-12T08:44:20.683096Z"},"trusted":true},"execution_count":8,"outputs":[{"name":"stdout","text":"<class 'pandas.core.frame.DataFrame'>\nRangeIndex: 44893 entries, 0 to 44892\nData columns (total 2 columns):\n #   Column  Non-Null Count  Dtype \n---  ------  --------------  ----- \n 0   text    44893 non-null  object\n 1   label   44893 non-null  object\ndtypes: object(2)\nmemory usage: 701.6+ KB\n","output_type":"stream"}]},{"cell_type":"code","source":"train_df['label'].value_counts()","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:44:20.686777Z","iopub.execute_input":"2024-08-12T08:44:20.687203Z","iopub.status.idle":"2024-08-12T08:44:20.721891Z","shell.execute_reply.started":"2024-08-12T08:44:20.687161Z","shell.execute_reply":"2024-08-12T08:44:20.720399Z"},"trusted":true},"execution_count":9,"outputs":[{"execution_count":9,"output_type":"execute_result","data":{"text/plain":"label\nsports      22339\nsci/tech    22336\nbusiness    22299\nworld       22229\ncoding       9193\nnature       7683\nAI           7260\nml           6758\nName: count, dtype: int64"},"metadata":{}}]},{"cell_type":"code","source":"# finding if it contains html tags\ndef contains_html_tags_regex(text):\n    html_tag_pattern = re.compile(r'<[^>]+>')\n    if bool(html_tag_pattern.search(text)) == True:\n        print(\"HTML Found!!\")\n\ntrain_df['text'].apply(contains_html_tags_regex).sum()","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:44:20.727199Z","iopub.execute_input":"2024-08-12T08:44:20.727635Z","iopub.status.idle":"2024-08-12T08:44:21.007547Z","shell.execute_reply.started":"2024-08-12T08:44:20.727594Z","shell.execute_reply":"2024-08-12T08:44:21.006242Z"},"trusted":true},"execution_count":10,"outputs":[{"name":"stdout","text":"HTML Found!!\nHTML Found!!\nHTML Found!!\nHTML Found!!\nHTML Found!!\nHTML Found!!\nHTML Found!!\nHTML Found!!\n","output_type":"stream"},{"execution_count":10,"output_type":"execute_result","data":{"text/plain":"0"},"metadata":{}}]},{"cell_type":"code","source":"# finding if it contains emails\ndef contains_emails(text):\n    email_pattern = re.compile(r'\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Z|a-z]{2,7}\\b')\n    if bool(email_pattern.search(text)) == True:\n        print(\"URL Found!!\")\n\ntrain_df['text'].apply(contains_emails).sum()","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:44:21.009343Z","iopub.execute_input":"2024-08-12T08:44:21.009983Z","iopub.status.idle":"2024-08-12T08:44:22.364059Z","shell.execute_reply.started":"2024-08-12T08:44:21.009932Z","shell.execute_reply":"2024-08-12T08:44:22.362689Z"},"trusted":true},"execution_count":11,"outputs":[{"execution_count":11,"output_type":"execute_result","data":{"text/plain":"0"},"metadata":{}}]},{"cell_type":"markdown","source":"## **2. Preprocessing**","metadata":{}},{"cell_type":"code","source":"from nltk.stem.porter import PorterStemmer\n\ndef preprocessing(text):\n    text = text.lower().strip()\n\n    # Replace certain special characters with their string equivalents\n    text = text.replace('%', ' percent')\n    text = text.replace('$', ' dollar ')\n    text = text.replace('₹', ' rupee ')\n    text = text.replace('€', ' euro ')\n\n    # remove html tags\n    html_tag_pattern = re.compile(r'<.*?>')\n    text = html_tag_pattern.sub('', text)\n\n    # remove urls\n    text = re.sub(r'\\s*(?:https?://)?www\\.\\S*\\.[A-Za-z]{2,5}\\s*', ' ', text).strip()\n\n    # Decontracting words\n    contractions = { \n    \"ain't\": \"am not\",\n    \"aren't\": \"are not\",\n    \"can't\": \"can not\",\n    \"can't've\": \"can not have\",\n    \"'cause\": \"because\",\n    \"could've\": \"could have\",\n    \"couldn't\": \"could not\",\n    \"couldn't've\": \"could not have\",\n    \"didn't\": \"did not\",\n    \"doesn't\": \"does not\",\n    \"don't\": \"do not\",\n    \"hadn't\": \"had not\",\n    \"hadn't've\": \"had not have\",\n    \"hasn't\": \"has not\",\n    \"haven't\": \"have not\",\n    \"he'd\": \"he would\",\n    \"he'd've\": \"he would have\",\n    \"he'll\": \"he will\",\n    \"he'll've\": \"he will have\",\n    \"he's\": \"he is\",\n    \"how'd\": \"how did\",\n    \"how'd'y\": \"how do you\",\n    \"how'll\": \"how will\",\n    \"how's\": \"how is\",\n    \"i'd\": \"i would\",\n    \"i'd've\": \"i would have\",\n    \"i'll\": \"i will\",\n    \"i'll've\": \"i will have\",\n    \"i'm\": \"i am\",\n    \"i've\": \"i have\",\n    \"isn't\": \"is not\",\n    \"it'd\": \"it would\",\n    \"it'd've\": \"it would have\",\n    \"it'll\": \"it will\",\n    \"it'll've\": \"it will have\",\n    \"it's\": \"it is\",\n    \"let's\": \"let us\",\n    \"ma'am\": \"madam\",\n    \"mayn't\": \"may not\",\n    \"might've\": \"might have\",\n    \"mightn't\": \"might not\",\n    \"mightn't've\": \"might not have\",\n    \"must've\": \"must have\",\n    \"mustn't\": \"must not\",\n    \"mustn't've\": \"must not have\",\n    \"needn't\": \"need not\",\n    \"needn't've\": \"need not have\",\n    \"o'clock\": \"of the clock\",\n    \"oughtn't\": \"ought not\",\n    \"oughtn't've\": \"ought not have\",\n    \"shan't\": \"shall not\",\n    \"sha'n't\": \"shall not\",\n    \"shan't've\": \"shall not have\",\n    \"she'd\": \"she would\",\n    \"she'd've\": \"she would have\",\n    \"she'll\": \"she will\",\n    \"she'll've\": \"she will have\",\n    \"she's\": \"she is\",\n    \"should've\": \"should have\",\n    \"shouldn't\": \"should not\",\n    \"shouldn't've\": \"should not have\",\n    \"so've\": \"so have\",\n    \"so's\": \"so as\",\n    \"that'd\": \"that would\",\n    \"that'd've\": \"that would have\",\n    \"that's\": \"that is\",\n    \"there'd\": \"there would\",\n    \"there'd've\": \"there would have\",\n    \"there's\": \"there is\",\n    \"they'd\": \"they would\",\n    \"they'd've\": \"they would have\",\n    \"they'll\": \"they will\",\n    \"they'll've\": \"they will have\",\n    \"they're\": \"they are\",\n    \"they've\": \"they have\",\n    \"to've\": \"to have\",\n    \"wasn't\": \"was not\",\n    \"we'd\": \"we would\",\n    \"we'd've\": \"we would have\",\n    \"we'll\": \"we will\",\n    \"we'll've\": \"we will have\",\n    \"we're\": \"we are\",\n    \"we've\": \"we have\",\n    \"weren't\": \"were not\",\n    \"what'll\": \"what will\",\n    \"what'll've\": \"what will have\",\n    \"what're\": \"what are\",\n    \"what's\": \"what is\",\n    \"what've\": \"what have\",\n    \"when's\": \"when is\",\n    \"when've\": \"when have\",\n    \"where'd\": \"where did\",\n    \"where's\": \"where is\",\n    \"where've\": \"where have\",\n    \"who'll\": \"who will\",\n    \"who'll've\": \"who will have\",\n    \"who's\": \"who is\",\n    \"who've\": \"who have\",\n    \"why's\": \"why is\",\n    \"why've\": \"why have\",\n    \"will've\": \"will have\",\n    \"won't\": \"will not\",\n    \"won't've\": \"will not have\",\n    \"would've\": \"would have\",\n    \"wouldn't\": \"would not\",\n    \"wouldn't've\": \"would not have\",\n    \"y'all\": \"you all\",\n    \"y'all'd\": \"you all would\",\n    \"y'all'd've\": \"you all would have\",\n    \"y'all're\": \"you all are\",\n    \"y'all've\": \"you all have\",\n    \"you'd\": \"you would\",\n    \"you'd've\": \"you would have\",\n    \"you'll\": \"you will\",\n    \"you'll've\": \"you will have\",\n    \"you're\": \"you are\",\n    \"you've\": \"you have\"\n    }\n\n    q_decontracted = []\n\n    for word in text.split():\n        if word in contractions:\n            word = contractions[word]\n\n        q_decontracted.append(word)\n\n    text = ' '.join(q_decontracted)\n    text = text.replace(\"'ve\", \" have\")\n    text = text.replace(\"n't\", \" not\")\n    text = text.replace(\"'re\", \" are\")\n    text = text.replace(\"'ll\", \" will\")\n\n    # remove stop words\n    new_text = []\n    stopwords = [\"i\", \"me\", \"my\", \"myself\", \"we\", \"our\", \"ours\", \"ourselves\", \"you\", \"your\", \"yours\", \"yourself\", \"yourselves\", \"he\", \"him\", \"his\", \"himself\", \"she\", \"her\", \"hers\", \"herself\", \"it\", \"its\", \"itself\", \"they\", \"them\", \"their\", \"theirs\", \"themselves\", \"what\", \"which\", \"who\", \"whom\", \"this\", \"that\", \"these\", \"those\", \"am\", \"is\", \"are\", \"was\", \"were\", \"be\", \"been\", \"being\", \"have\", \"has\", \"had\", \"having\", \"do\", \"does\", \"did\", \"doing\", \"a\", \"an\", \"the\", \"and\", \"but\", \"if\", \"or\", \"because\", \"as\", \"until\", \"while\", \"of\", \"at\", \"by\", \"for\", \"with\", \"about\", \"against\", \"between\", \"into\", \"through\", \"during\", \"before\", \"after\", \"above\", \"below\", \"to\", \"from\", \"up\", \"down\", \"in\", \"out\", \"on\", \"off\", \"over\", \"under\", \"again\", \"further\", \"then\", \"once\", \"here\", \"there\", \"when\", \"where\", \"why\", \"how\", \"all\", \"any\", \"both\", \"each\", \"few\", \"more\", \"most\", \"other\", \"some\", \"such\", \"no\", \"nor\", \"not\", \"only\", \"own\", \"same\", \"so\", \"than\", \"too\", \"very\", \"s\", \"t\", \"can\", \"will\", \"just\", \"don\", \"should\", \"now\"]\n    for word in text.split():\n        if word  in stopwords:\n            new_text.append('')\n        else:\n            new_text.append(word)\n    x = new_text[:]\n    new_text.clear\n    text = \" \".join(x)\n\n    # remove punctuation\n    punct = string.punctuation\n\n    text = text.translate(str.maketrans('', '', punct))\n    \n    # remove numbers\n    digits = string.digits\n    text = text.translate(str.maketrans('', '', digits))\n        \n    # removing some characters\n    text = text.replace('’', ' ')\n\n    text = ' '.join(text.split())\n    \n    # stemming\n    ps = PorterStemmer()\n    \n    text = \" \".join([ps.stem(word) for word in text.split()])\n    \n    return text","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:44:22.365805Z","iopub.execute_input":"2024-08-12T08:44:22.366302Z","iopub.status.idle":"2024-08-12T08:44:22.401350Z","shell.execute_reply.started":"2024-08-12T08:44:22.366255Z","shell.execute_reply":"2024-08-12T08:44:22.399987Z"},"trusted":true},"execution_count":12,"outputs":[]},{"cell_type":"code","source":"preprocessing(\"’ s lightmatter photonic ambitions light AI up an $ 80M B round\")","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:44:22.402898Z","iopub.execute_input":"2024-08-12T08:44:22.403367Z","iopub.status.idle":"2024-08-12T08:44:22.425765Z","shell.execute_reply.started":"2024-08-12T08:44:22.403329Z","shell.execute_reply":"2024-08-12T08:44:22.424455Z"},"trusted":true},"execution_count":13,"outputs":[{"execution_count":13,"output_type":"execute_result","data":{"text/plain":"'lightmatt photon ambit light ai dollar m b round'"},"metadata":{}}]},{"cell_type":"code","source":"train_df['text'] = train_df['text'].apply(preprocessing)\ntest_df['text'] = test_df['text'].apply(preprocessing)","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:44:22.427231Z","iopub.execute_input":"2024-08-12T08:44:22.427635Z","iopub.status.idle":"2024-08-12T08:47:08.687108Z","shell.execute_reply.started":"2024-08-12T08:44:22.427591Z","shell.execute_reply":"2024-08-12T08:47:08.686113Z"},"trusted":true},"execution_count":14,"outputs":[]},{"cell_type":"code","source":"print(train_df.shape)\nprint(test_df.shape)","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:47:08.688493Z","iopub.execute_input":"2024-08-12T08:47:08.688873Z","iopub.status.idle":"2024-08-12T08:47:08.695313Z","shell.execute_reply.started":"2024-08-12T08:47:08.688840Z","shell.execute_reply":"2024-08-12T08:47:08.693855Z"},"trusted":true},"execution_count":15,"outputs":[{"name":"stdout","text":"(120097, 2)\n(44893, 2)\n","output_type":"stream"}]},{"cell_type":"code","source":"train_df.head()","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:47:08.696761Z","iopub.execute_input":"2024-08-12T08:47:08.697338Z","iopub.status.idle":"2024-08-12T08:47:08.719350Z","shell.execute_reply.started":"2024-08-12T08:47:08.697291Z","shell.execute_reply":"2024-08-12T08:47:08.717950Z"},"trusted":true},"execution_count":16,"outputs":[{"execution_count":16,"output_type":"execute_result","data":{"text/plain":"                                                text   label\n0  leak oil refineri st croix biden give environm...  nature\n1             practic step build transpar busi remot  coding\n2                   convert imag runway video use ml      ml\n3               design principl visual direct weight  coding\n4        california perman enact protect tree joshua  nature","text/html":"<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>text</th>\n      <th>label</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>leak oil refineri st croix biden give environm...</td>\n      <td>nature</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>practic step build transpar busi remot</td>\n      <td>coding</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>convert imag runway video use ml</td>\n      <td>ml</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>design principl visual direct weight</td>\n      <td>coding</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>california perman enact protect tree joshua</td>\n      <td>nature</td>\n    </tr>\n  </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"train_df.dropna(inplace=True)\ntest_df.dropna(inplace=True)\ntrain_df.shape","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:47:08.720814Z","iopub.execute_input":"2024-08-12T08:47:08.721233Z","iopub.status.idle":"2024-08-12T08:47:08.795264Z","shell.execute_reply.started":"2024-08-12T08:47:08.721200Z","shell.execute_reply":"2024-08-12T08:47:08.794181Z"},"trusted":true},"execution_count":17,"outputs":[{"execution_count":17,"output_type":"execute_result","data":{"text/plain":"(120097, 2)"},"metadata":{}}]},{"cell_type":"markdown","source":"## **3. Preparing Dataset For Training**","metadata":{}},{"cell_type":"markdown","source":"### **3.1. Extracting Features From The Dataset**","metadata":{}},{"cell_type":"code","source":"from sklearn.feature_extraction.text import TfidfVectorizer\n\ntfidf = TfidfVectorizer(min_df=8, ngram_range=(1, 3))","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:47:08.796819Z","iopub.execute_input":"2024-08-12T08:47:08.797729Z","iopub.status.idle":"2024-08-12T08:47:08.803184Z","shell.execute_reply.started":"2024-08-12T08:47:08.797687Z","shell.execute_reply":"2024-08-12T08:47:08.801863Z"},"trusted":true},"execution_count":18,"outputs":[]},{"cell_type":"code","source":"# using tfidf to extract features from the dataset\ntrain_text_vector = tfidf.fit_transform(train_df['text']).toarray()\ntest_text_vector = tfidf.transform(test_df['text']).toarray()","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:47:08.804636Z","iopub.execute_input":"2024-08-12T08:47:08.805074Z","iopub.status.idle":"2024-08-12T08:47:45.635014Z","shell.execute_reply.started":"2024-08-12T08:47:08.805043Z","shell.execute_reply":"2024-08-12T08:47:45.633422Z"},"trusted":true},"execution_count":19,"outputs":[]},{"cell_type":"code","source":"train_text_vector","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:47:45.636630Z","iopub.execute_input":"2024-08-12T08:47:45.637207Z","iopub.status.idle":"2024-08-12T08:47:45.647057Z","shell.execute_reply.started":"2024-08-12T08:47:45.637162Z","shell.execute_reply":"2024-08-12T08:47:45.645459Z"},"trusted":true},"execution_count":20,"outputs":[{"execution_count":20,"output_type":"execute_result","data":{"text/plain":"array([[0., 0., 0., ..., 0., 0., 0.],\n       [0., 0., 0., ..., 0., 0., 0.],\n       [0., 0., 0., ..., 0., 0., 0.],\n       ...,\n       [0., 0., 0., ..., 0., 0., 0.],\n       [0., 0., 0., ..., 0., 0., 0.],\n       [0., 0., 0., ..., 0., 0., 0.]])"},"metadata":{}}]},{"cell_type":"code","source":"# converting the data array into dataframe\ntrain_text_vector_df = pd.DataFrame(train_text_vector, index=train_df.index)\ntest_text_vector_df = pd.DataFrame(test_text_vector, index=test_df.index)","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:47:45.648219Z","iopub.execute_input":"2024-08-12T08:47:45.648646Z","iopub.status.idle":"2024-08-12T08:47:45.670509Z","shell.execute_reply.started":"2024-08-12T08:47:45.648611Z","shell.execute_reply":"2024-08-12T08:47:45.669110Z"},"trusted":true},"execution_count":21,"outputs":[]},{"cell_type":"code","source":"X_train = train_text_vector_df\ny_train = train_df['label']","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:47:45.672440Z","iopub.execute_input":"2024-08-12T08:47:45.672947Z","iopub.status.idle":"2024-08-12T08:47:45.688735Z","shell.execute_reply.started":"2024-08-12T08:47:45.672911Z","shell.execute_reply":"2024-08-12T08:47:45.687204Z"},"trusted":true},"execution_count":22,"outputs":[]},{"cell_type":"code","source":"train_text_vector_df.shape","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:47:45.690102Z","iopub.execute_input":"2024-08-12T08:47:45.690587Z","iopub.status.idle":"2024-08-12T08:47:45.708181Z","shell.execute_reply.started":"2024-08-12T08:47:45.690538Z","shell.execute_reply":"2024-08-12T08:47:45.706811Z"},"trusted":true},"execution_count":23,"outputs":[{"execution_count":23,"output_type":"execute_result","data":{"text/plain":"(120097, 52860)"},"metadata":{}}]},{"cell_type":"code","source":"test_text_vector_df.shape","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:47:45.709713Z","iopub.execute_input":"2024-08-12T08:47:45.710249Z","iopub.status.idle":"2024-08-12T08:47:45.726817Z","shell.execute_reply.started":"2024-08-12T08:47:45.710198Z","shell.execute_reply":"2024-08-12T08:47:45.724408Z"},"trusted":true},"execution_count":24,"outputs":[{"execution_count":24,"output_type":"execute_result","data":{"text/plain":"(44893, 52860)"},"metadata":{}}]},{"cell_type":"code","source":"X_test = test_text_vector_df\ny_test = test_df['label']","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:47:45.729221Z","iopub.execute_input":"2024-08-12T08:47:45.729901Z","iopub.status.idle":"2024-08-12T08:47:45.741643Z","shell.execute_reply.started":"2024-08-12T08:47:45.729863Z","shell.execute_reply":"2024-08-12T08:47:45.740067Z"},"trusted":true},"execution_count":25,"outputs":[]},{"cell_type":"code","source":"X_train.head()","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:47:45.743355Z","iopub.execute_input":"2024-08-12T08:47:45.743749Z","iopub.status.idle":"2024-08-12T08:47:45.798489Z","shell.execute_reply.started":"2024-08-12T08:47:45.743715Z","shell.execute_reply":"2024-08-12T08:47:45.796890Z"},"trusted":true},"execution_count":26,"outputs":[{"execution_count":26,"output_type":"execute_result","data":{"text/plain":"   0      1      2      3      4      5      6      7      8      9      ...  \\\n0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0  ...   \n1    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0  ...   \n2    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0  ...   \n3    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0  ...   \n4    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0  ...   \n\n   52850  52851  52852  52853  52854  52855  52856  52857  52858  52859  \n0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0  \n1    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0  \n2    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0  \n3    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0  \n4    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0  \n\n[5 rows x 52860 columns]","text/html":"<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>0</th>\n      <th>1</th>\n      <th>2</th>\n      <th>3</th>\n      <th>4</th>\n      <th>5</th>\n      <th>6</th>\n      <th>7</th>\n      <th>8</th>\n      <th>9</th>\n      <th>...</th>\n      <th>52850</th>\n      <th>52851</th>\n      <th>52852</th>\n      <th>52853</th>\n      <th>52854</th>\n      <th>52855</th>\n      <th>52856</th>\n      <th>52857</th>\n      <th>52858</th>\n      <th>52859</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>...</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>...</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>...</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>...</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>...</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n    </tr>\n  </tbody>\n</table>\n<p>5 rows × 52860 columns</p>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"y_train.unique()","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:47:45.800281Z","iopub.execute_input":"2024-08-12T08:47:45.800776Z","iopub.status.idle":"2024-08-12T08:47:45.821025Z","shell.execute_reply.started":"2024-08-12T08:47:45.800731Z","shell.execute_reply":"2024-08-12T08:47:45.819503Z"},"trusted":true},"execution_count":27,"outputs":[{"execution_count":27,"output_type":"execute_result","data":{"text/plain":"array(['nature', 'coding', 'ml', 'AI', 'business', 'world', 'sports',\n       'sci/tech'], dtype=object)"},"metadata":{}}]},{"cell_type":"markdown","source":"### **3.2. Encoding Labels**","metadata":{}},{"cell_type":"code","source":"from sklearn.preprocessing import LabelEncoder\n\n# Initialize the encoder\nlabel_encoder = LabelEncoder()\n\n# Fit and transform the labels\ny_train = label_encoder.fit_transform(y_train)\ny_test = label_encoder.fit_transform(y_test)","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:47:45.827700Z","iopub.execute_input":"2024-08-12T08:47:45.828140Z","iopub.status.idle":"2024-08-12T08:47:45.882509Z","shell.execute_reply.started":"2024-08-12T08:47:45.828105Z","shell.execute_reply":"2024-08-12T08:47:45.881298Z"},"trusted":true},"execution_count":28,"outputs":[]},{"cell_type":"code","source":"y_train","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:47:45.884175Z","iopub.execute_input":"2024-08-12T08:47:45.884634Z","iopub.status.idle":"2024-08-12T08:47:45.892209Z","shell.execute_reply.started":"2024-08-12T08:47:45.884580Z","shell.execute_reply":"2024-08-12T08:47:45.891081Z"},"trusted":true},"execution_count":29,"outputs":[{"execution_count":29,"output_type":"execute_result","data":{"text/plain":"array([4, 2, 3, ..., 6, 5, 7])"},"metadata":{}}]},{"cell_type":"markdown","source":"## **4. Model Training**","metadata":{}},{"cell_type":"markdown","source":"### **4.1. Naive Bayes**","metadata":{}},{"cell_type":"code","source":"from sklearn.naive_bayes import MultinomialNB, BernoulliNB","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:47:45.893669Z","iopub.execute_input":"2024-08-12T08:47:45.894086Z","iopub.status.idle":"2024-08-12T08:47:45.912017Z","shell.execute_reply.started":"2024-08-12T08:47:45.894027Z","shell.execute_reply":"2024-08-12T08:47:45.910827Z"},"trusted":true},"execution_count":30,"outputs":[]},{"cell_type":"code","source":"mnb_classifier = MultinomialNB()\nmnb_classifier.fit(X_train, y_train)","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:47:45.914105Z","iopub.execute_input":"2024-08-12T08:47:45.914605Z","iopub.status.idle":"2024-08-12T08:48:13.583967Z","shell.execute_reply.started":"2024-08-12T08:47:45.914559Z","shell.execute_reply":"2024-08-12T08:48:13.582439Z"},"trusted":true},"execution_count":31,"outputs":[{"execution_count":31,"output_type":"execute_result","data":{"text/plain":"MultinomialNB()","text/html":"<style>#sk-container-id-1 {color: black;background-color: white;}#sk-container-id-1 pre{padding: 0;}#sk-container-id-1 div.sk-toggleable {background-color: white;}#sk-container-id-1 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-1 label.sk-toggleable__label-arrow:before {content: \"\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-1 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-1 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-1 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"\";}#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-1 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-1 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-1 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-1 div.sk-label:hover label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-1 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-1 div.sk-item {position: relative;z-index: 1;}#sk-container-id-1 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-1 div.sk-item::before, #sk-container-id-1 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-1 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-1 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-1 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-1 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-1 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-1 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-1 div.sk-label-container {text-align: center;}#sk-container-id-1 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-1 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>MultinomialNB()</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" checked><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">MultinomialNB</label><div class=\"sk-toggleable__content\"><pre>MultinomialNB()</pre></div></div></div></div></div>"},"metadata":{}}]},{"cell_type":"markdown","source":"## **5. Evaluate the models**","metadata":{}},{"cell_type":"code","source":"from sklearn.metrics import accuracy_score, classification_report\nmnb_predictions = mnb_classifier.predict(X_test)\nmnb_accuracy = accuracy_score(y_test, mnb_predictions)\nprint(\"Multinomial Naïve Bayes Accuracy:\", mnb_accuracy)\nprint(\"Classification Report:\")\nprint(classification_report(y_test, mnb_predictions))","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:48:13.586179Z","iopub.execute_input":"2024-08-12T08:48:13.594164Z","iopub.status.idle":"2024-08-12T08:48:23.832059Z","shell.execute_reply.started":"2024-08-12T08:48:13.594092Z","shell.execute_reply":"2024-08-12T08:48:23.830782Z"},"trusted":true},"execution_count":32,"outputs":[{"name":"stdout","text":"Multinomial Naïve Bayes Accuracy: 0.8872430000222752\nClassification Report:\n              precision    recall  f1-score   support\n\n           0       0.94      0.74      0.83      1567\n           1       0.87      0.87      0.87      9601\n           2       0.93      0.77      0.84      1932\n           3       0.92      0.87      0.90      1404\n           4       0.90      0.70      0.78      1593\n           5       0.81      0.88      0.85      9564\n           6       0.93      0.98      0.96      9561\n           7       0.92      0.90      0.91      9671\n\n    accuracy                           0.89     44893\n   macro avg       0.90      0.84      0.87     44893\nweighted avg       0.89      0.89      0.89     44893\n\n","output_type":"stream"}]},{"cell_type":"code","source":"# testing the best model\ntext = \"US jobs growth in June beats expectations\"\n\nprepro_text = preprocessing(text)\nvec_text = tfidf.transform([prepro_text])\nresult = mnb_classifier.predict(vec_text)\nprint(label_encoder.inverse_transform(result))","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:48:23.833408Z","iopub.execute_input":"2024-08-12T08:48:23.833758Z","iopub.status.idle":"2024-08-12T08:48:23.845561Z","shell.execute_reply.started":"2024-08-12T08:48:23.833725Z","shell.execute_reply":"2024-08-12T08:48:23.844142Z"},"trusted":true},"execution_count":33,"outputs":[{"name":"stdout","text":"['business']\n","output_type":"stream"}]},{"cell_type":"code","source":"# saving the best model\nimport joblib\n\njoblib.dump(mnb_classifier, 'mnb_classifier.joblib')\njoblib.dump(tfidf, 'tfidf_vectorizer.joblib')\njoblib.dump(label_encoder, 'label_encoder.joblib')","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:48:23.846861Z","iopub.execute_input":"2024-08-12T08:48:23.847303Z","iopub.status.idle":"2024-08-12T08:48:46.927900Z","shell.execute_reply.started":"2024-08-12T08:48:23.847270Z","shell.execute_reply":"2024-08-12T08:48:46.926786Z"},"trusted":true},"execution_count":34,"outputs":[{"execution_count":34,"output_type":"execute_result","data":{"text/plain":"['label_encoder.joblib']"},"metadata":{}}]},{"cell_type":"markdown","source":"### **4.2. Random Forest**","metadata":{"execution":{"iopub.status.busy":"2024-07-06T08:14:01.195834Z","iopub.execute_input":"2024-07-06T08:14:01.196660Z","iopub.status.idle":"2024-07-06T08:14:01.201176Z","shell.execute_reply.started":"2024-07-06T08:14:01.196624Z","shell.execute_reply":"2024-07-06T08:14:01.199962Z"}}},{"cell_type":"code","source":"from sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import accuracy_score\nrf = RandomForestClassifier(n_estimators=100, max_depth=None, min_samples_split=2, random_state=42)\nrf.fit(X_train, y_train)\ny_pred = rf.predict(X_test)\naccuracy_score(y_test,y_pred)","metadata":{"execution":{"iopub.status.busy":"2024-08-12T08:48:46.929334Z","iopub.execute_input":"2024-08-12T08:48:46.929685Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"from xgboost import XGBClassifier\n\nxgb_model = XGBClassifier().fit(X_train, y_train)\n\n# predict\nxgb_y_predict = xgb_model.predict(X_test)\n\n# accuracy score\nxgb_score = accuracy_score(xgb_y_predict, y_test)\n\nprint('Accuracy score is:', xgb_score)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"","metadata":{},"execution_count":null,"outputs":[]}]}