{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "06391735-12f3-45dd-988f-28559ca176f9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# All imports live in this first cell so the notebook survives Restart & Run All.\n",
    "import numpy as np\n",
    "import optuna\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "from sklearn import cluster\n",
    "from sklearn.decomposition import PCA\n",
    "from sklearn.impute import SimpleImputer\n",
    "from sklearn.manifold import TSNE\n",
    "from sklearn.metrics import silhouette_samples, silhouette_score\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.preprocessing import QuantileTransformer\n",
    "from yellowbrick.cluster import KElbowVisualizer\n",
    "\n",
    "# Tunable constants shared by the cells below.\n",
    "SEED = 42                       # one seed for every stochastic component\n",
    "SILHOUETTE_SAMPLE_SIZE = 10000  # rows drawn per trial when scoring\n",
    "N_TRIALS = 10                   # maximum number of Optuna trials\n",
    "TIMEOUT_SECONDS = 5000          # wall-clock budget for the whole study"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "a0544a40-d98a-4b01-84dd-f74c8c373749",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(98000, 30)"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = pd.read_csv('./data.csv')\n",
    "# sub = pd.read_csv('/kaggle/input/tabular-playground-series-jul-2022/sample_submission.csv')\n",
    "data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "ba3107bc-49c1-4aff-a86d-1b0da2e9f84a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Feature matrix used for clustering: every column of `data` except 'id'.\n",
    "X = data.drop(columns=['id'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "24040c75-4a16-4d62-9b78-6a5af14fe560",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[32m[I 2022-07-03 16:47:32,023]\u001b[0m A new study created in memory with name: tonne\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "# Baseline pipeline; `objective` re-parameterises it in place on every trial.\n",
    "pipe = Pipeline(steps=[\n",
    "    ('imputer', SimpleImputer()),\n",
    "    ('preprocessing', QuantileTransformer(random_state=SEED)),\n",
    "    ('reduce_dimenstion', PCA(n_components=10, random_state=SEED)),\n",
    "    ('estimator', cluster.KMeans(n_clusters=6, random_state=SEED)),\n",
    "])\n",
    "\n",
    "# Candidate steps, keyed by the categorical values suggested to Optuna.\n",
    "rds = {\n",
    "    'pca': PCA(random_state=SEED),\n",
    "    'tsne': TSNE(random_state=SEED),\n",
    "}\n",
    "estimators = {\n",
    "    'kmeans': cluster.KMeans(random_state=SEED),\n",
    "    'agg': cluster.AgglomerativeClustering(),\n",
    "}\n",
    "\n",
    "\n",
    "def objective(trial):\n",
    "    \"\"\"Score one pipeline configuration by its silhouette coefficient.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    trial : optuna.trial.Trial\n",
    "        Supplies the reducer name, `n_components`, the clusterer name and\n",
    "        `n_clusters` for this configuration.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    float\n",
    "        Silhouette score of the cluster labels in the reduced feature space,\n",
    "        estimated on at most `SILHOUETTE_SAMPLE_SIZE` rows.\n",
    "\n",
    "    Notes\n",
    "    -----\n",
    "    Mutates and fits the module-level `pipe`, so after a call `pipe` holds\n",
    "    the configuration of the most recent trial.\n",
    "    \"\"\"\n",
    "    params = {}\n",
    "\n",
    "    # Only 'pca' is searched for now; 'tsne' stays registered in `rds` but unused.\n",
    "    rd_name = trial.suggest_categorical('rd', ['pca'])\n",
    "    params['reduce_dimenstion'] = rds[rd_name]\n",
    "    params['reduce_dimenstion__n_components'] = trial.suggest_int('n_components', 7, 20)\n",
    "\n",
    "    estimator_name = trial.suggest_categorical('cluster', ['kmeans'])\n",
    "    params['estimator'] = estimators[estimator_name]\n",
    "\n",
    "    if estimator_name in ['kmeans', 'agg']:\n",
    "        params['estimator__n_clusters'] = trial.suggest_int('n_clusters', 4, 12)\n",
    "\n",
    "    pipe.set_params(**params)\n",
    "\n",
    "    # Run the preprocessing steps once and reuse the result for both the\n",
    "    # clustering and the scoring. fit_predict is used on the final step because\n",
    "    # both registered clusterers provide it ('agg' has no separate predict).\n",
    "    X_reduced = pipe[:-1].fit_transform(X)\n",
    "    y_pred = pipe[-1].fit_predict(X_reduced)\n",
    "\n",
    "    # The exact silhouette needs every pairwise distance, which is quadratic in\n",
    "    # the number of rows, so score a seeded random subsample when X is large.\n",
    "    n_rows = len(X_reduced)\n",
    "    sample_size = SILHOUETTE_SAMPLE_SIZE if n_rows > SILHOUETTE_SAMPLE_SIZE else None\n",
    "    return silhouette_score(X_reduced, y_pred,\n",
    "                            sample_size=sample_size,\n",
    "                            random_state=SEED)\n",
    "\n",
    "\n",
    "study = optuna.create_study(\n",
    "    study_name='tonne',\n",
    "    direction='maximize',\n",
    "    sampler=optuna.samplers.TPESampler(seed=SEED),  # seeded so suggestions repeat\n",
    ")\n",
    "# Trials run sequentially: `objective` mutates the shared `pipe`, so it must not\n",
    "# be called from several threads at once (leave n_jobs at its default).\n",
    "study.optimize(objective,\n",
    "               n_trials=N_TRIALS,\n",
    "               timeout=TIMEOUT_SECONDS)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "f556b049-5d9f-4c1b-9d7f-d0f176e9dece",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'rd': 'pca', 'n_components': 7, 'cluster': 'kmeans', 'n_clusters': 11}"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "best_params = study.best_params\n",
    "best_params"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "8c1a1766-2cca-41e4-83a7-749a7d8a8d47",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [ "
| \n", " | Id | \n", "Predicted | \n", "
|---|---|---|
| 0 | \n", "0 | \n", "6 | \n", "
| 1 | \n", "1 | \n", "0 | \n", "
| 2 | \n", "2 | \n", "2 | \n", "
| 3 | \n", "3 | \n", "1 | \n", "
| 4 | \n", "4 | \n", "5 | \n", "