{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Init" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import joblib\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.compose import ColumnTransformer\n", "from sklearn.preprocessing import OneHotEncoder, StandardScaler\n", "from sklearn.ensemble import RandomForestRegressor" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load Data" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# The loaded frame keeps its own name so later cells never overwrite the raw data.\n", "train_raw = pd.read_parquet('train_raw.parquet')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Restrict to Education buildings; primary_use is constant after the filter, so drop it.\n", "# Derived from train_raw without in-place mutation, so this cell can be re-run safely\n", "# and does not call drop(inplace=True) on a filtered slice (SettingWithCopyWarning).\n", "train = train_raw.loc[train_raw['primary_use'] == 'Education'].drop(columns='primary_use')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Prep Data" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | building_id | \n", "timestamp | \n", "meter_reading | \n", "site_id | \n", "square_feet | \n", "year_built | \n", "floor_count | \n", "air_temperature | \n", "cloud_coverage | \n", "dew_temperature | \n", "precip_depth_1_hr | \n", "sea_level_pressure | \n", "wind_direction | \n", "wind_speed | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "4 | \n", "2016-01-01 | \n", "0.0 | \n", "0 | \n", "116607 | \n", "1975.0 | \n", "NaN | \n", "25.0 | \n", "6.0 | \n", "20.0 | \n", "NaN | \n", "1019.7 | \n", "0.0 | \n", "0.0 | \n", "
| 1 | \n", "5 | \n", "2016-01-01 | \n", "0.0 | \n", "0 | \n", "8000 | \n", "2000.0 | \n", "NaN | \n", "25.0 | \n", "6.0 | \n", "20.0 | \n", "NaN | \n", "1019.7 | \n", "0.0 | \n", "0.0 | \n", "
| 3 | \n", "7 | \n", "2016-01-01 | \n", "0.0 | \n", "0 | \n", "121074 | \n", "1989.0 | \n", "NaN | \n", "25.0 | \n", "6.0 | \n", "20.0 | \n", "NaN | \n", "1019.7 | \n", "0.0 | \n", "0.0 | \n", "
| 4 | \n", "123 | \n", "2016-01-01 | \n", "46.4 | \n", "1 | \n", "61204 | \n", "1989.0 | \n", "6.0 | \n", "3.8 | \n", "NaN | \n", "2.4 | \n", "NaN | \n", "1020.9 | \n", "240.0 | \n", "3.1 | \n", "
| 5 | \n", "124 | \n", "2016-01-01 | \n", "9.3 | \n", "1 | \n", "38319 | \n", "1900.0 | \n", "6.0 | \n", "3.8 | \n", "NaN | \n", "2.4 | \n", "NaN | \n", "1020.9 | \n", "240.0 | \n", "3.1 | \n", "
| \n", " | meter_reading | \n", "square_feet | \n", "air_temperature | \n", "precip_depth_1_hr | \n", "wind_speed | \n", "month | \n", "weekday | \n", "hour | \n", "age | \n", "
|---|---|---|---|---|---|---|---|---|---|
| 20 | \n", "0.00 | \n", "116607 | \n", "24.4 | \n", "-1.0 | \n", "1.5 | \n", "January | \n", "Friday | \n", "1 | \n", "40.0 | \n", "
| 21 | \n", "0.00 | \n", "8000 | \n", "24.4 | \n", "-1.0 | \n", "1.5 | \n", "January | \n", "Friday | \n", "1 | \n", "15.0 | \n", "
| 23 | \n", "0.00 | \n", "121074 | \n", "24.4 | \n", "-1.0 | \n", "1.5 | \n", "January | \n", "Friday | \n", "1 | \n", "26.0 | \n", "
| 28 | \n", "381.71 | \n", "204349 | \n", "13.9 | \n", "0.0 | \n", "4.1 | \n", "January | \n", "Friday | \n", "1 | \n", "22.0 | \n", "
| 31 | \n", "353.20 | \n", "272278 | \n", "13.9 | \n", "0.0 | \n", "4.1 | \n", "January | \n", "Friday | \n", "1 | \n", "60.0 | \n", "
| \n", " | meter_reading | \n", "square_feet | \n", "air_temperature | \n", "precip_depth_1_hr | \n", "wind_speed | \n", "hour | \n", "age | \n", "
|---|---|---|---|---|---|---|---|
| count | \n", "52218.000000 | \n", "52218.000000 | \n", "52218.000000 | \n", "52218.000000 | \n", "52218.000000 | \n", "52218.000000 | \n", "52218.000000 | \n", "
| mean | \n", "300.266741 | \n", "112478.963806 | \n", "21.740814 | \n", "0.954307 | \n", "3.479120 | \n", "11.504845 | \n", "43.837278 | \n", "
| std | \n", "409.992228 | \n", "82319.508267 | \n", "9.626215 | \n", "9.655634 | \n", "2.174506 | \n", "6.923051 | \n", "33.697352 | \n", "
| min | \n", "0.000000 | \n", "8000.000000 | \n", "-10.600000 | \n", "-1.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "
| 25% | \n", "28.326200 | \n", "40950.000000 | \n", "15.600000 | \n", "0.000000 | \n", "2.100000 | \n", "6.000000 | \n", "15.000000 | \n", "
| 50% | \n", "120.730000 | \n", "116607.000000 | \n", "23.300000 | \n", "0.000000 | \n", "3.100000 | \n", "12.000000 | \n", "26.000000 | \n", "
| 75% | \n", "466.863250 | \n", "121074.000000 | \n", "28.300000 | \n", "0.000000 | \n", "4.600000 | \n", "18.000000 | \n", "60.000000 | \n", "
| max | \n", "3592.000000 | \n", "272278.000000 | \n", "47.200000 | \n", "343.000000 | \n", "17.000000 | \n", "23.000000 | \n", "105.000000 | \n", "
Pipeline(steps=[('preprocessor',\n",
" ColumnTransformer(transformers=[('num',\n",
" Pipeline(steps=[('scaler',\n",
" StandardScaler())]),\n",
" ['square_feet',\n",
" 'air_temperature',\n",
" 'precip_depth_1_hr',\n",
" 'wind_speed', 'age']),\n",
" ('cat',\n",
" Pipeline(steps=[('onehot',\n",
" OneHotEncoder(handle_unknown='ignore'))]),\n",
" ['month', 'weekday',\n",
" 'hour'])])),\n",
" ('model', RandomForestRegressor(n_jobs=-1, random_state=42))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('preprocessor',\n",
" ColumnTransformer(transformers=[('num',\n",
" Pipeline(steps=[('scaler',\n",
" StandardScaler())]),\n",
" ['square_feet',\n",
" 'air_temperature',\n",
" 'precip_depth_1_hr',\n",
" 'wind_speed', 'age']),\n",
" ('cat',\n",
" Pipeline(steps=[('onehot',\n",
" OneHotEncoder(handle_unknown='ignore'))]),\n",
" ['month', 'weekday',\n",
" 'hour'])])),\n",
" ('model', RandomForestRegressor(n_jobs=-1, random_state=42))])ColumnTransformer(transformers=[('num',\n",
" Pipeline(steps=[('scaler', StandardScaler())]),\n",
" ['square_feet', 'air_temperature',\n",
" 'precip_depth_1_hr', 'wind_speed', 'age']),\n",
" ('cat',\n",
" Pipeline(steps=[('onehot',\n",
" OneHotEncoder(handle_unknown='ignore'))]),\n",
" ['month', 'weekday', 'hour'])])['square_feet', 'air_temperature', 'precip_depth_1_hr', 'wind_speed', 'age']
StandardScaler()
['month', 'weekday', 'hour']
OneHotEncoder(handle_unknown='ignore')
RandomForestRegressor(n_jobs=-1, random_state=42)
| \n", " | square_feet | \n", "air_temperature | \n", "precip_depth_1_hr | \n", "wind_speed | \n", "month | \n", "weekday | \n", "hour | \n", "age | \n", "
|---|---|---|---|---|---|---|---|---|
| 0 | \n", "272278 | \n", "13.9 | \n", "0.0 | \n", "4.1 | \n", "January | \n", "Friday | \n", "1 | \n", "60 | \n", "