In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree. The per-file print from the stock
# Kaggle template is commented out, so the traversal produces no output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Uncomment to list every available input file:
        # print(os.path.join(dirname, filename))
        pass

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

In [3]:
# Locations of the competition tables on the Kaggle input mount.
train_csv = "/kaggle/input/icr-identify-age-related-conditions/train.csv"
test_csv = "/kaggle/input/icr-identify-age-related-conditions/test.csv"

# Read both tables into DataFrames with pandas' default CSV parsing.
df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

In [4]:
# Encode the categorical 'EJ' column ('A' -> 0, 'B' -> 1) in both tables.
#
# Series.map turns every value that is missing from the mapping into NaN, so
# mapping an already-encoded column (values 0/1) would wipe it out. The dtype
# guard makes this cell safe to re-run: once 'EJ' is numeric it is left alone.
ej_encoding = {'A': 0, 'B': 1}
for frame in (df_train, df_test):
    if not pd.api.types.is_numeric_dtype(frame['EJ']):
        frame['EJ'] = frame['EJ'].map(ej_encoding)

In [5]:
# Training features: every column except the target ("Class") and the row
# identifier ("Id").
X_train = df_train.drop(columns=["Class", "Id"])

# Training target: the "Class" column.
y_train = df_train["Class"]

# Test features: only the row identifier is removed from the test table.
X_test = df_test.drop(columns="Id")

In [6]:
# Names of the training feature columns that contain at least one NaN.
has_missing = X_train.isna().any(axis=0)
columns_with_missing = has_missing[has_missing].index.tolist()

# Mean imputation: the column means are learned from the training features
# only and then applied unchanged to the test features.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train)
X_train_imputed = imputer.transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Standardisation: likewise fitted on the imputed training matrix and reused
# for the imputed test matrix.
scaler = StandardScaler()
scaler.fit(X_train_imputed)
X_train_scaled = scaler.transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

In [7]:
# Fit a random forest on all scaled features to rank them by importance.
# random_state is pinned because the forest is stochastic: without a seed the
# importance ranking, and therefore the set of selected features, can differ
# from one run to the next.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train_scaled, y_train)
feature_importances = rfc.feature_importances_

# One row per feature, in the original column order of X_train.
importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})

# Sort by importance, most important first. The index is not reset, so after
# sorting each row's index label is still that feature's original column
# position; the feature-selection cell slices the scaled arrays with it.
importance_df = importance_df.sort_values(by='Importance', ascending=False)

In [8]:
# Number of highest-ranked features to keep.
num_variables = 10

# The first rows of the importance table are the most important features,
# because the table is sorted in descending order of importance.
top_features = importance_df.head(num_variables)
important_variables = top_features['Feature'].tolist()

# importance_df keeps its pre-sort index, so the index labels of the top rows
# are the column positions of those features in the scaled arrays.
top_positions = top_features.index
X_train_important = X_train_scaled[:, top_positions]
X_test_important = X_test_scaled[:, top_positions]

In [9]:
# Train a second random forest restricted to the selected top features.
# random_state is pinned so repeated runs produce the same model and the same
# predictions; an unseeded forest gives different results on every run.
rfc_important = RandomForestClassifier(random_state=42)
rfc_important.fit(X_train_important, y_train)

# Hard class labels for the test set from the reduced-feature model.
rfc_pred = rfc_important.predict(X_test_important)


In [10]:
# Class probabilities for the test set from the feature-selected model.
# The model and the matrix must match: rfc_important was fitted on
# X_train_important, so it is scored on X_test_important. Scoring the
# full-feature `rfc` on X_test_scaled here would leave the feature-selection
# and retraining steps unused by the submission.
rfc_pred_proba = rfc_important.predict_proba(X_test_important)

# Assemble the submission table: one row per test Id with one probability
# column per class.
# NOTE(review): columns 0 and 1 are taken positionally from predict_proba;
# this assumes the fitted classes are exactly 0 and 1 in that order — confirm
# against rfc_important.classes_.
predictions_df = pd.DataFrame({
    'Id': df_test['Id'],
    'class_0': rfc_pred_proba[:, 0],
    'class_1': rfc_pred_proba[:, 1],
})

# Write the submission file to the working directory, without the index column.
predictions_df.to_csv('submission.csv', index=False)