{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Init" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import joblib\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.compose import ColumnTransformer\n", "from sklearn.preprocessing import OneHotEncoder, StandardScaler\n", "from sklearn.ensemble import RandomForestRegressor" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load Data" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "train = pd.read_parquet('train_raw.parquet')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "train = train[train.primary_use == 'Education']\n", "train.drop(columns='primary_use', inplace=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Prep Data" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
building_idtimestampmeter_readingsite_idsquare_feetyear_builtfloor_countair_temperaturecloud_coveragedew_temperatureprecip_depth_1_hrsea_level_pressurewind_directionwind_speed
042016-01-010.001166071975.0NaN25.06.020.0NaN1019.70.00.0
152016-01-010.0080002000.0NaN25.06.020.0NaN1019.70.00.0
372016-01-010.001210741989.0NaN25.06.020.0NaN1019.70.00.0
41232016-01-0146.41612041989.06.03.8NaN2.4NaN1020.9240.03.1
51242016-01-019.31383191900.06.03.8NaN2.4NaN1020.9240.03.1
\n", "
" ], "text/plain": [ " building_id timestamp meter_reading site_id square_feet year_built \\\n", "0 4 2016-01-01 0.0 0 116607 1975.0 \n", "1 5 2016-01-01 0.0 0 8000 2000.0 \n", "3 7 2016-01-01 0.0 0 121074 1989.0 \n", "4 123 2016-01-01 46.4 1 61204 1989.0 \n", "5 124 2016-01-01 9.3 1 38319 1900.0 \n", "\n", " floor_count air_temperature cloud_coverage dew_temperature \\\n", "0 NaN 25.0 6.0 20.0 \n", "1 NaN 25.0 6.0 20.0 \n", "3 NaN 25.0 6.0 20.0 \n", "4 6.0 3.8 NaN 2.4 \n", "5 6.0 3.8 NaN 2.4 \n", "\n", " precip_depth_1_hr sea_level_pressure wind_direction wind_speed \n", "0 NaN 1019.7 0.0 0.0 \n", "1 NaN 1019.7 0.0 0.0 \n", "3 NaN 1019.7 0.0 0.0 \n", "4 NaN 1020.9 240.0 3.1 \n", "5 NaN 1020.9 240.0 3.1 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train.head()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# Drop building_id, site_id, dew_temperature, sea_level_pressure, wind_direction\n", "train.drop(columns=['building_id', 'site_id', 'floor_count', 'cloud_coverage', 'dew_temperature', 'sea_level_pressure', 'wind_direction'], inplace=True)\n", "\n", "# Drop all rows with missing values\n", "train.dropna(inplace=True)\n", "\n", "# Extract month, weekday, and hour of the day from timestamp\n", "train['timestamp'] = pd.to_datetime(train['timestamp'])\n", "train['month'] = train['timestamp'].dt.month.map({1: 'January', 2: 'February', 3: 'March', 4: 'April', 5: 'May', 6: 'June', 7: 'July', 8: 'August', 9: 'September', 10: 'October', 11: 'November', 12: 'December'})\n", "train['weekday'] = train['timestamp'].dt.weekday.map({0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday'})\n", "train['hour'] = train['timestamp'].dt.hour\n", "train.drop(columns=['timestamp'], inplace=True)\n", "\n", "# Calculate age of building\n", "train['age'] = train['year_built'].max() - train['year_built']\n", "train.drop(columns=['year_built'], inplace=True)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(52218, 9)" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train.shape" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
meter_readingsquare_feetair_temperatureprecip_depth_1_hrwind_speedmonthweekdayhourage
200.0011660724.4-1.01.5JanuaryFriday140.0
210.00800024.4-1.01.5JanuaryFriday115.0
230.0012107424.4-1.01.5JanuaryFriday126.0
28381.7120434913.90.04.1JanuaryFriday122.0
31353.2027227813.90.04.1JanuaryFriday160.0
\n", "
" ], "text/plain": [ " meter_reading square_feet air_temperature precip_depth_1_hr \\\n", "20 0.00 116607 24.4 -1.0 \n", "21 0.00 8000 24.4 -1.0 \n", "23 0.00 121074 24.4 -1.0 \n", "28 381.71 204349 13.9 0.0 \n", "31 353.20 272278 13.9 0.0 \n", "\n", " wind_speed month weekday hour age \n", "20 1.5 January Friday 1 40.0 \n", "21 1.5 January Friday 1 15.0 \n", "23 1.5 January Friday 1 26.0 \n", "28 4.1 January Friday 1 22.0 \n", "31 4.1 January Friday 1 60.0 " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train.head()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
meter_readingsquare_feetair_temperatureprecip_depth_1_hrwind_speedhourage
count52218.00000052218.00000052218.00000052218.00000052218.00000052218.00000052218.000000
mean300.266741112478.96380621.7408140.9543073.47912011.50484543.837278
std409.99222882319.5082679.6262159.6556342.1745066.92305133.697352
min0.0000008000.000000-10.600000-1.0000000.0000000.0000000.000000
25%28.32620040950.00000015.6000000.0000002.1000006.00000015.000000
50%120.730000116607.00000023.3000000.0000003.10000012.00000026.000000
75%466.863250121074.00000028.3000000.0000004.60000018.00000060.000000
max3592.000000272278.00000047.200000343.00000017.00000023.000000105.000000
\n", "
" ], "text/plain": [ " meter_reading square_feet air_temperature precip_depth_1_hr \\\n", "count 52218.000000 52218.000000 52218.000000 52218.000000 \n", "mean 300.266741 112478.963806 21.740814 0.954307 \n", "std 409.992228 82319.508267 9.626215 9.655634 \n", "min 0.000000 8000.000000 -10.600000 -1.000000 \n", "25% 28.326200 40950.000000 15.600000 0.000000 \n", "50% 120.730000 116607.000000 23.300000 0.000000 \n", "75% 466.863250 121074.000000 28.300000 0.000000 \n", "max 3592.000000 272278.000000 47.200000 343.000000 \n", "\n", " wind_speed hour age \n", "count 52218.000000 52218.000000 52218.000000 \n", "mean 3.479120 11.504845 43.837278 \n", "std 2.174506 6.923051 33.697352 \n", "min 0.000000 0.000000 0.000000 \n", "25% 2.100000 6.000000 15.000000 \n", "50% 3.100000 12.000000 26.000000 \n", "75% 4.600000 18.000000 60.000000 \n", "max 17.000000 23.000000 105.000000 " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train.describe()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# Split into features and target\n", "features = ['square_feet', 'air_temperature', 'precip_depth_1_hr', 'wind_speed', 'month', 'weekday', 'hour', 'age']\n", "target = 'meter_reading'\n", "\n", "X_train = train[features]\n", "y_train = train[target]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Train Model" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# Define pipeline for mixed numerical and categorical features\n", "\n", "numerical_features = ['square_feet', 'air_temperature', 'precip_depth_1_hr', 'wind_speed', 'age']\n", "numerical_transformer = Pipeline(\n", " steps=[(\"scaler\", StandardScaler(with_mean=True, with_std=True))\n", " ]\n", " )\n", "\n", "categorical_features = ['month', 'weekday', 'hour']\n", "categorical_transformer = Pipeline(\n", " steps=[(\"onehot\", OneHotEncoder(handle_unknown=\"ignore\"))]\n", " )\n", "\n", "preprocessor = ColumnTransformer(\n", " transformers=[\n", " ('num', numerical_transformer, numerical_features),\n", " ('cat', categorical_transformer, categorical_features)\n", " ])\n", "\n", "pipeline = Pipeline([\n", " ('preprocessor', preprocessor),\n", " ('model', RandomForestRegressor(n_jobs=-1, n_estimators=100, random_state=42))\n", "])\n" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Pipeline(steps=[('preprocessor',\n",
       "                 ColumnTransformer(transformers=[('num',\n",
       "                                                  Pipeline(steps=[('scaler',\n",
       "                                                                   StandardScaler())]),\n",
       "                                                  ['square_feet',\n",
       "                                                   'air_temperature',\n",
       "                                                   'precip_depth_1_hr',\n",
       "                                                   'wind_speed', 'age']),\n",
       "                                                 ('cat',\n",
       "                                                  Pipeline(steps=[('onehot',\n",
       "                                                                   OneHotEncoder(handle_unknown='ignore'))]),\n",
       "                                                  ['month', 'weekday',\n",
       "                                                   'hour'])])),\n",
       "                ('model', RandomForestRegressor(n_jobs=-1, random_state=42))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "Pipeline(steps=[('preprocessor',\n", " ColumnTransformer(transformers=[('num',\n", " Pipeline(steps=[('scaler',\n", " StandardScaler())]),\n", " ['square_feet',\n", " 'air_temperature',\n", " 'precip_depth_1_hr',\n", " 'wind_speed', 'age']),\n", " ('cat',\n", " Pipeline(steps=[('onehot',\n", " OneHotEncoder(handle_unknown='ignore'))]),\n", " ['month', 'weekday',\n", " 'hour'])])),\n", " ('model', RandomForestRegressor(n_jobs=-1, random_state=42))])" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pipeline.fit(X_train, y_train)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Store Model\n" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['pipeline.pkl']" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Store pipeline to disk using joblib\n", "joblib.dump(pipeline, 'pipeline.pkl', compress=3) " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load Model" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "pipeline = joblib.load('pipeline.pkl')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Test Inference" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
square_feetair_temperatureprecip_depth_1_hrwind_speedmonthweekdayhourage
027227813.90.04.1JanuaryFriday160
\n", "
" ], "text/plain": [ " square_feet air_temperature precip_depth_1_hr wind_speed month \\\n", "0 272278 13.9 0.0 4.1 January \n", "\n", " weekday hour age \n", "0 Friday 1 60 " ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "new_building = {}\n", "new_building['square_feet'] = 272278\n", "new_building['air_temperature'] = 13.9\n", "new_building['precip_depth_1_hr'] = 0.0\n", "new_building['wind_speed'] = 4.1\n", "new_building['month'] = 'January'\n", "new_building['weekday'] = 'Friday'\n", "new_building['hour'] = 1\n", "new_building['age'] = 60\n", "\n", "new_building_df = pd.DataFrame(new_building, index=[0])\n", "new_building_df.head()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([358.077])" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pipeline.predict(new_building_df)" ] } ], "metadata": { "kernelspec": { "display_name": "greensparks", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.9" } }, "nbformat": 4, "nbformat_minor": 2 }