{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "29834325", "metadata": { "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", "execution": { "iopub.execute_input": "2023-06-28T14:29:11.557719Z", "iopub.status.busy": "2023-06-28T14:29:11.557247Z", "iopub.status.idle": "2023-06-28T14:29:11.571599Z", "shell.execute_reply": "2023-06-28T14:29:11.570549Z" }, "papermill": { "duration": 0.026028, "end_time": "2023-06-28T14:29:11.574556", "exception": false, "start_time": "2023-06-28T14:29:11.548528", "status": "completed" }, "tags": [] }, "outputs": [], "source": [
  "# This Python 3 environment comes with many helpful analytics libraries installed\n",
  "# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n",
  "# For example, here's several helpful packages to load\n",
  "\n",
  "import numpy as np # linear algebra\n",
  "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
  "\n",
  "# Input data files are available in the read-only \"../input/\" directory\n",
  "# os stays imported for ad-hoc inspection of that directory, e.g. os.listdir('/kaggle/input').\n",
  "# Nothing is traversed here: this notebook reads its two CSV files by explicit path.\n",
  "import os\n",
  "\n",
  "# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n",
  "# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session"
] }, { "cell_type": "code", "execution_count": 2, "id": "68b4799b", "metadata": { "execution": { "iopub.execute_input": "2023-06-28T14:29:11.586208Z", "iopub.status.busy": "2023-06-28T14:29:11.585762Z", "iopub.status.idle": "2023-06-28T14:29:13.734524Z", "shell.execute_reply": "2023-06-28T14:29:13.732965Z" }, "papermill": { "duration":
2.158201, "end_time": "2023-06-28T14:29:13.737697", "exception": false, "start_time": "2023-06-28T14:29:11.579496", "status": "completed" }, "tags": [] }, "outputs": [], "source": [
  "import pandas as pd\n",
  "import numpy as np\n",
  "from sklearn.model_selection import train_test_split\n",
  "from sklearn.ensemble import RandomForestClassifier\n",
  "from sklearn.preprocessing import StandardScaler\n",
  "from sklearn.impute import SimpleImputer"
] }, { "cell_type": "code", "execution_count": 3, "id": "dd1aa6d5", "metadata": { "execution": { "iopub.execute_input": "2023-06-28T14:29:13.749104Z", "iopub.status.busy": "2023-06-28T14:29:13.748590Z", "iopub.status.idle": "2023-06-28T14:29:13.805019Z", "shell.execute_reply": "2023-06-28T14:29:13.803969Z" }, "papermill": { "duration": 0.06561, "end_time": "2023-06-28T14:29:13.807921", "exception": false, "start_time": "2023-06-28T14:29:13.742311", "status": "completed" }, "tags": [] }, "outputs": [], "source": [
  "# Load the competition train and test tables from the read-only Kaggle input directory\n",
  "df_train = pd.read_csv(\"/kaggle/input/icr-identify-age-related-conditions/train.csv\")\n",
  "df_test = pd.read_csv(\"/kaggle/input/icr-identify-age-related-conditions/test.csv\")"
] }, { "cell_type": "code", "execution_count": 4, "id": "563c47ff", "metadata": { "execution": { "iopub.execute_input": "2023-06-28T14:29:13.819160Z", "iopub.status.busy": "2023-06-28T14:29:13.818727Z", "iopub.status.idle": "2023-06-28T14:29:13.839746Z", "shell.execute_reply": "2023-06-28T14:29:13.838298Z" }, "papermill": { "duration": 0.030103, "end_time": "2023-06-28T14:29:13.843061", "exception": false, "start_time": "2023-06-28T14:29:13.812958", "status": "completed" }, "tags": [] }, "outputs": [], "source": [
  "# Encode the categorical 'EJ' column: 'A' -> 0 and 'B' -> 1.\n",
  "# Series.map turns every value missing from the dict into NaN, and this cell overwrites\n",
  "# the column in place. The identity entries (0 -> 0, 1 -> 1) therefore keep the cell\n",
  "# idempotent: re-running it leaves an already-encoded column unchanged instead of\n",
  "# wiping it to NaN. Any other value still becomes NaN, exactly as on a first run.\n",
  "EJ_ENCODING = {'A': 0, 'B': 1, 0: 0, 1: 1}\n",
  "df_train['EJ'] = df_train['EJ'].map(EJ_ENCODING)\n",
  "df_test['EJ'] = df_test['EJ'].map(EJ_ENCODING)"
] }, { "cell_type": "code", "execution_count": 5, "id": "af9245ad", "metadata": { "execution": {
"iopub.execute_input": "2023-06-28T14:29:13.853869Z", "iopub.status.busy": "2023-06-28T14:29:13.853426Z", "iopub.status.idle": "2023-06-28T14:29:13.867982Z", "shell.execute_reply": "2023-06-28T14:29:13.866486Z" }, "papermill": { "duration": 0.022904, "end_time": "2023-06-28T14:29:13.870386", "exception": false, "start_time": "2023-06-28T14:29:13.847482", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "# Split the training data into features (X) and target variable (y)\n", "X_train = df_train.drop([\"Class\", \"Id\"], axis=1) # Exclude non-numeric columns\n", "y_train = df_train[\"Class\"]\n", "\n", "# Split the test data into features (X_test)\n", "X_test = df_test.drop(\"Id\", axis=1)" ] }, { "cell_type": "code", "execution_count": 6, "id": "48963e25", "metadata": { "execution": { "iopub.execute_input": "2023-06-28T14:29:13.881371Z", "iopub.status.busy": "2023-06-28T14:29:13.880917Z", "iopub.status.idle": "2023-06-28T14:29:13.900968Z", "shell.execute_reply": "2023-06-28T14:29:13.899934Z" }, "papermill": { "duration": 0.029018, "end_time": "2023-06-28T14:29:13.903834", "exception": false, "start_time": "2023-06-28T14:29:13.874816", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "# Identify columns with missing values\n", "columns_with_missing = X_train.columns[X_train.isna().any()].tolist()\n", "\n", "# Impute missing values with the mean of each column\n", "imputer = SimpleImputer(strategy='mean')\n", "X_train_imputed = imputer.fit_transform(X_train)\n", "X_test_imputed = imputer.transform(X_test)\n", "\n", "# Scale the features using StandardScaler\n", "scaler = StandardScaler()\n", "X_train_scaled = scaler.fit_transform(X_train_imputed)\n", "X_test_scaled = scaler.transform(X_test_imputed)" ] }, { "cell_type": "code", "execution_count": 7, "id": "7c337184", "metadata": { "execution": { "iopub.execute_input": "2023-06-28T14:29:13.915609Z", "iopub.status.busy": "2023-06-28T14:29:13.914400Z", "iopub.status.idle": 
"2023-06-28T14:29:14.392939Z", "shell.execute_reply": "2023-06-28T14:29:14.391879Z" }, "papermill": { "duration": 0.487453, "end_time": "2023-06-28T14:29:14.395785", "exception": false, "start_time": "2023-06-28T14:29:13.908332", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "# Get feature importances\n", "rfc = RandomForestClassifier()\n", "rfc.fit(X_train_scaled, y_train)\n", "feature_importances = rfc.feature_importances_\n", "\n", "# Create a DataFrame for feature importance\n", "importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})\n", "\n", "# Sort the features by importance (descending order)\n", "importance_df = importance_df.sort_values(by='Importance', ascending=False)" ] }, { "cell_type": "code", "execution_count": 8, "id": "ce5fddae", "metadata": { "execution": { "iopub.execute_input": "2023-06-28T14:29:14.406819Z", "iopub.status.busy": "2023-06-28T14:29:14.406345Z", "iopub.status.idle": "2023-06-28T14:29:14.413437Z", "shell.execute_reply": "2023-06-28T14:29:14.412266Z" }, "papermill": { "duration": 0.015929, "end_time": "2023-06-28T14:29:14.416226", "exception": false, "start_time": "2023-06-28T14:29:14.400297", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "# Select the top important variables\n", "num_variables = 10 # Specify the number of top important variables to use\n", "important_variables = importance_df['Feature'].tolist()[:num_variables]\n", "X_train_important = X_train_scaled[:, importance_df.index[:num_variables]]\n", "X_test_important = X_test_scaled[:, importance_df.index[:num_variables]]" ] }, { "cell_type": "code", "execution_count": 9, "id": "4e746beb", "metadata": { "execution": { "iopub.execute_input": "2023-06-28T14:29:14.427650Z", "iopub.status.busy": "2023-06-28T14:29:14.427116Z", "iopub.status.idle": "2023-06-28T14:29:14.756684Z", "shell.execute_reply": "2023-06-28T14:29:14.755491Z" }, "papermill": { "duration": 0.338831, "end_time": 
"2023-06-28T14:29:14.759951", "exception": false, "start_time": "2023-06-28T14:29:14.421120", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "# Train the random forest model using only the important variables\n", "rfc_important = RandomForestClassifier()\n", "rfc_important.fit(X_train_important, y_train)\n", "\n", "# Predict on the test set using only the important variables\n", "rfc_pred = rfc_important.predict(X_test_important)\n" ] }, { "cell_type": "code", "execution_count": 10, "id": "13cf4b5b", "metadata": { "execution": { "iopub.execute_input": "2023-06-28T14:29:14.771894Z", "iopub.status.busy": "2023-06-28T14:29:14.771075Z", "iopub.status.idle": "2023-06-28T14:29:14.796398Z", "shell.execute_reply": "2023-06-28T14:29:14.795487Z" }, "papermill": { "duration": 0.034975, "end_time": "2023-06-28T14:29:14.799451", "exception": false, "start_time": "2023-06-28T14:29:14.764476", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "# Predict probabilities for each class in the test set\n", "rfc_pred_proba = rfc.predict_proba(X_test_scaled)\n", "\n", "# Create a DataFrame to store the predictions\n", "predictions_df = pd.DataFrame({'Id': df_test['Id'],\n", " 'class_0': rfc_pred_proba[:, 0],\n", " 'class_1': rfc_pred_proba[:, 1]})\n", "\n", "# Save the predictions to a CSV file\n", "predictions_df.to_csv('submission.csv', index=False)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.10" }, "papermill": { "default_parameters": {}, "duration": 18.313658, "end_time": "2023-06-28T14:29:16.232503", "environment_variables": {}, "exception": null, "input_path": "__notebook__.ipynb", "output_path": "__notebook__.ipynb", "parameters": {}, "start_time": 
"2023-06-28T14:28:57.918845", "version": "2.4.0" } }, "nbformat": 4, "nbformat_minor": 5 }