File size: 11,799 Bytes

aa79d7c

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "29834325",
   "metadata": {
    "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
    "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
    "execution": {
     "iopub.execute_input": "2023-06-28T14:29:11.557719Z",
     "iopub.status.busy": "2023-06-28T14:29:11.557247Z",
     "iopub.status.idle": "2023-06-28T14:29:11.571599Z",
     "shell.execute_reply": "2023-06-28T14:29:11.570549Z"
    },
    "papermill": {
     "duration": 0.026028,
     "end_time": "2023-06-28T14:29:11.574556",
     "exception": false,
     "start_time": "2023-06-28T14:29:11.548528",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# This Python 3 environment comes with many helpful analytics libraries installed\n",
    "# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n",
    "# For example, here are several helpful packages to load\n",
    "\n",
    "import numpy as np # linear algebra\n",
    "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
    "\n",
    "# Input data files are available in the read-only \"../input/\" directory (/kaggle/input).\n",
    "# Nothing is listed here: to inspect the available files, iterate over\n",
    "# os.walk('/kaggle/input') and print os.path.join(dirname, filename) for each file.\n",
    "\n",
    "import os\n",
    "\n",
    "# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n",
    "# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "68b4799b",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-06-28T14:29:11.586208Z",
     "iopub.status.busy": "2023-06-28T14:29:11.585762Z",
     "iopub.status.idle": "2023-06-28T14:29:13.734524Z",
     "shell.execute_reply": "2023-06-28T14:29:13.732965Z"
    },
    "papermill": {
     "duration": 2.158201,
     "end_time": "2023-06-28T14:29:13.737697",
     "exception": false,
     "start_time": "2023-06-28T14:29:11.579496",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.impute import SimpleImputer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "dd1aa6d5",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-06-28T14:29:13.749104Z",
     "iopub.status.busy": "2023-06-28T14:29:13.748590Z",
     "iopub.status.idle": "2023-06-28T14:29:13.805019Z",
     "shell.execute_reply": "2023-06-28T14:29:13.803969Z"
    },
    "papermill": {
     "duration": 0.06561,
     "end_time": "2023-06-28T14:29:13.807921",
     "exception": false,
     "start_time": "2023-06-28T14:29:13.742311",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Read the training and test CSVs from the Kaggle input directory\n",
    "TRAIN_CSV = \"/kaggle/input/icr-identify-age-related-conditions/train.csv\"\n",
    "TEST_CSV = \"/kaggle/input/icr-identify-age-related-conditions/test.csv\"\n",
    "\n",
    "df_train = pd.read_csv(TRAIN_CSV)\n",
    "df_test = pd.read_csv(TEST_CSV)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "563c47ff",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-06-28T14:29:13.819160Z",
     "iopub.status.busy": "2023-06-28T14:29:13.818727Z",
     "iopub.status.idle": "2023-06-28T14:29:13.839746Z",
     "shell.execute_reply": "2023-06-28T14:29:13.838298Z"
    },
    "papermill": {
     "duration": 0.030103,
     "end_time": "2023-06-28T14:29:13.843061",
     "exception": false,
     "start_time": "2023-06-28T14:29:13.812958",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Encode the categorical 'EJ' column numerically: 'A' -> 0, 'B' -> 1.\n",
    "# Series.map turns every value that is missing from the mapping into NaN, so applying it\n",
    "# to an already-encoded column would blank the column out. The dtype guard skips frames\n",
    "# whose 'EJ' is already numeric, which makes this cell safe to re-run.\n",
    "EJ_CODES = {'A': 0, 'B': 1}\n",
    "\n",
    "for frame in (df_train, df_test):\n",
    "    if not pd.api.types.is_numeric_dtype(frame['EJ']):\n",
    "        frame['EJ'] = frame['EJ'].map(EJ_CODES)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "af9245ad",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-06-28T14:29:13.853869Z",
     "iopub.status.busy": "2023-06-28T14:29:13.853426Z",
     "iopub.status.idle": "2023-06-28T14:29:13.867982Z",
     "shell.execute_reply": "2023-06-28T14:29:13.866486Z"
    },
    "papermill": {
     "duration": 0.022904,
     "end_time": "2023-06-28T14:29:13.870386",
     "exception": false,
     "start_time": "2023-06-28T14:29:13.847482",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Target: the 'Class' column of the training table\n",
    "y_train = df_train[\"Class\"]\n",
    "\n",
    "# Training features: every column except the target and the 'Id' column\n",
    "X_train = df_train.drop(columns=[\"Class\", \"Id\"])\n",
    "\n",
    "# Test features: the test table without its 'Id' column\n",
    "X_test = df_test.drop(columns=\"Id\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "48963e25",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-06-28T14:29:13.881371Z",
     "iopub.status.busy": "2023-06-28T14:29:13.880917Z",
     "iopub.status.idle": "2023-06-28T14:29:13.900968Z",
     "shell.execute_reply": "2023-06-28T14:29:13.899934Z"
    },
    "papermill": {
     "duration": 0.029018,
     "end_time": "2023-06-28T14:29:13.903834",
     "exception": false,
     "start_time": "2023-06-28T14:29:13.874816",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Names of the training feature columns that contain at least one missing value\n",
    "missing_mask = X_train.isna().any()\n",
    "columns_with_missing = missing_mask[missing_mask].index.tolist()\n",
    "\n",
    "# Mean imputation: the column means are learned from the training features only and\n",
    "# then used to fill the gaps in both the training and the test features\n",
    "imputer = SimpleImputer(strategy='mean').fit(X_train)\n",
    "X_train_imputed = imputer.transform(X_train)\n",
    "X_test_imputed = imputer.transform(X_test)\n",
    "\n",
    "# Standardisation: likewise fitted on the training data and applied to both sets\n",
    "scaler = StandardScaler().fit(X_train_imputed)\n",
    "X_train_scaled = scaler.transform(X_train_imputed)\n",
    "X_test_scaled = scaler.transform(X_test_imputed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "7c337184",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-06-28T14:29:13.915609Z",
     "iopub.status.busy": "2023-06-28T14:29:13.914400Z",
     "iopub.status.idle": "2023-06-28T14:29:14.392939Z",
     "shell.execute_reply": "2023-06-28T14:29:14.391879Z"
    },
    "papermill": {
     "duration": 0.487453,
     "end_time": "2023-06-28T14:29:14.395785",
     "exception": false,
     "start_time": "2023-06-28T14:29:13.908332",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Fit a random forest on all imputed, scaled features to obtain importance scores.\n",
    "# random_state is pinned so that the ranking, and therefore the feature subset picked\n",
    "# from it, is the same on every run; an unseeded forest can rank features differently\n",
    "# each time the notebook is executed.\n",
    "rfc = RandomForestClassifier(random_state=42)\n",
    "rfc.fit(X_train_scaled, y_train)\n",
    "feature_importances = rfc.feature_importances_\n",
    "\n",
    "# Pair each importance score with its column name. The default 0..n-1 row labels equal\n",
    "# the column positions in the scaled arrays, and sort_values keeps them attached.\n",
    "importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})\n",
    "\n",
    "# Most important features first\n",
    "importance_df = importance_df.sort_values(by='Importance', ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "ce5fddae",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-06-28T14:29:14.406819Z",
     "iopub.status.busy": "2023-06-28T14:29:14.406345Z",
     "iopub.status.idle": "2023-06-28T14:29:14.413437Z",
     "shell.execute_reply": "2023-06-28T14:29:14.412266Z"
    },
    "papermill": {
     "duration": 0.015929,
     "end_time": "2023-06-28T14:29:14.416226",
     "exception": false,
     "start_time": "2023-06-28T14:29:14.400297",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Keep only the highest-ranked features\n",
    "num_variables = 10  # how many of the top-ranked features to keep\n",
    "top_rows = importance_df.head(num_variables)\n",
    "important_variables = top_rows['Feature'].tolist()\n",
    "\n",
    "# The row labels of importance_df are the original column positions, so they can be\n",
    "# used directly to pick columns out of the scaled arrays\n",
    "top_positions = top_rows.index\n",
    "X_train_important = X_train_scaled[:, top_positions]\n",
    "X_test_important = X_test_scaled[:, top_positions]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "4e746beb",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-06-28T14:29:14.427650Z",
     "iopub.status.busy": "2023-06-28T14:29:14.427116Z",
     "iopub.status.idle": "2023-06-28T14:29:14.756684Z",
     "shell.execute_reply": "2023-06-28T14:29:14.755491Z"
    },
    "papermill": {
     "duration": 0.338831,
     "end_time": "2023-06-28T14:29:14.759951",
     "exception": false,
     "start_time": "2023-06-28T14:29:14.421120",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Refit a random forest on the selected top features only. random_state is pinned so\n",
    "# that repeated runs produce the same model and the same predictions.\n",
    "rfc_important = RandomForestClassifier(random_state=42)\n",
    "rfc_important.fit(X_train_important, y_train)\n",
    "\n",
    "# Hard class labels for the test set from the reduced-feature model\n",
    "rfc_pred = rfc_important.predict(X_test_important)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "13cf4b5b",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-06-28T14:29:14.771894Z",
     "iopub.status.busy": "2023-06-28T14:29:14.771075Z",
     "iopub.status.idle": "2023-06-28T14:29:14.796398Z",
     "shell.execute_reply": "2023-06-28T14:29:14.795487Z"
    },
    "papermill": {
     "duration": 0.034975,
     "end_time": "2023-06-28T14:29:14.799451",
     "exception": false,
     "start_time": "2023-06-28T14:29:14.764476",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Class probabilities for the test set, taken from the reduced-feature model.\n",
    "# Using rfc (the full-feature forest fitted only to rank features) with X_test_scaled\n",
    "# here would leave the feature selection and rfc_important without any effect on the\n",
    "# submission, so the model and inputs must be the 'important' pair.\n",
    "rfc_pred_proba = rfc_important.predict_proba(X_test_important)\n",
    "\n",
    "# predict_proba orders its columns like rfc_important.classes_;\n",
    "# NOTE(review): column 0 / column 1 are assumed to be labels 0 / 1 - confirm against y_train\n",
    "predictions_df = pd.DataFrame({'Id': df_test['Id'],\n",
    "                               'class_0': rfc_pred_proba[:, 0],\n",
    "                               'class_1': rfc_pred_proba[:, 1]})\n",
    "\n",
    "# Write the submission file without the DataFrame index\n",
    "predictions_df.to_csv('submission.csv', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.10"
  },
  "papermill": {
   "default_parameters": {},
   "duration": 18.313658,
   "end_time": "2023-06-28T14:29:16.232503",
   "environment_variables": {},
   "exception": null,
   "input_path": "__notebook__.ipynb",
   "output_path": "__notebook__.ipynb",
   "parameters": {},
   "start_time": "2023-06-28T14:28:57.918845",
   "version": "2.4.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}