{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "!pip install pygam seaborn statsmodels --quiet" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import pickle\n", "import torch\n", "import torch.nn as nn\n", "import torch.optim as optim\n", "from sklearn.datasets import fetch_california_housing\n", "from sklearn.model_selection import train_test_split\n", "import numpy as np\n", "import pandas as pd\n", "from sklearn.preprocessing import StandardScaler,OrdinalEncoder\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.linear_model import LinearRegression, LogisticRegression\n", "from sklearn.metrics import classification_report, recall_score, precision_score, accuracy_score\n", "from sklearn.metrics import mean_squared_error\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "import zipfile\n", "from statsmodels.genmod.generalized_linear_model import GLM\n", "from statsmodels.genmod.families import Gamma\n", "from statsmodels.genmod.families.links import Log\n", "from statsmodels.tools import add_constant\n", "from pygam import LinearGAM, GammaGAM, s, f" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CONTROLAGE1METRO3REGIONLMEDFMRL30L50L80IPOV...FMTCOST06RELAMICATFMTCOST08RELAMICATFMTCOST12RELAMICATFMTCOSTMEDRELAMICATFMTINCRELAMICATFMTASSISTEDFMTBURDENFMTREGIONFMTSTATUSHOUSE_AGE
0'036000001146'34'2''4'84200258024950415506650017849...'7 120% AMI +''7 120% AMI +''7 120% AMI +''7 120% AMI +''7 120% AMI +''.''2 30% to 50%''West''-5'9
1'036000001147'43'2''4'84200224127700461507385022629...'7 120% AMI +''7 120% AMI +''7 120% AMI +''7 120% AMI +''7 120% AMI +''.''1 Less than 30%''West''-5'9
2'036000001149'60'2''4'84200257724950415506650017399...'7 120% AMI +''7 120% AMI +''7 120% AMI +''7 120% AMI +''7 120% AMI +''.''1 Less than 30%''West''-5'10
3'036000001150'37'2''4'84200224122200369505910014985...'7 120% AMI +''7 120% AMI +''7 120% AMI +''7 120% AMI +''7 120% AMI +''.''2 30% to 50%''West''-5'10
4'036000001151'33'2''4'84200258027700461507385022557...'7 120% AMI +''7 120% AMI +''7 120% AMI +''7 120% AMI +''7 120% AMI +''.''2 30% to 50%''West''-5'16
\n", "

5 rows × 100 columns

\n", "
" ], "text/plain": [ " CONTROL AGE1 METRO3 REGION LMED FMR L30 L50 L80 \\\n", "0 '036000001146' 34 '2' '4' 84200 2580 24950 41550 66500 \n", "1 '036000001147' 43 '2' '4' 84200 2241 27700 46150 73850 \n", "2 '036000001149' 60 '2' '4' 84200 2577 24950 41550 66500 \n", "3 '036000001150' 37 '2' '4' 84200 2241 22200 36950 59100 \n", "4 '036000001151' 33 '2' '4' 84200 2580 27700 46150 73850 \n", "\n", " IPOV ... FMTCOST06RELAMICAT FMTCOST08RELAMICAT FMTCOST12RELAMICAT \\\n", "0 17849 ... '7 120% AMI +' '7 120% AMI +' '7 120% AMI +' \n", "1 22629 ... '7 120% AMI +' '7 120% AMI +' '7 120% AMI +' \n", "2 17399 ... '7 120% AMI +' '7 120% AMI +' '7 120% AMI +' \n", "3 14985 ... '7 120% AMI +' '7 120% AMI +' '7 120% AMI +' \n", "4 22557 ... '7 120% AMI +' '7 120% AMI +' '7 120% AMI +' \n", "\n", " FMTCOSTMEDRELAMICAT FMTINCRELAMICAT FMTASSISTED FMTBURDEN \\\n", "0 '7 120% AMI +' '7 120% AMI +' '.' '2 30% to 50%' \n", "1 '7 120% AMI +' '7 120% AMI +' '.' '1 Less than 30%' \n", "2 '7 120% AMI +' '7 120% AMI +' '.' '1 Less than 30%' \n", "3 '7 120% AMI +' '7 120% AMI +' '.' '2 30% to 50%' \n", "4 '7 120% AMI +' '7 120% AMI +' '.' '2 30% to 50%' \n", "\n", " FMTREGION FMTSTATUS HOUSE_AGE \n", "0 'West' '-5' 9 \n", "1 'West' '-5' 9 \n", "2 'West' '-5' 10 \n", "3 'West' '-5' 10 \n", "4 'West' '-5' 16 \n", "\n", "[5 rows x 100 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_2011, df_2013 = pd.read_csv('thads2011.txt'),pd.read_csv('thads2013n.txt')\n", "df_2011['HOUSE_AGE'], df_2013['HOUSE_AGE'] = 2011 - df_2011['BUILT'], 2013 - df_2013['BUILT']\n", "df = pd.concat([df_2011, df_2013], ignore_index=True)\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "df = df[df['OWNRENT'] == \"'1'\"]\n", "df = df[df['FMTSTRUCTURETYPE'] == \"'1 Single Family'\"]" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
METRO3REGIONLMEDBEDRMSHOUSE_AGEVALUEROOMSPERUTILITY
0'2''4'842004972000083300.000000
1'2''4'842003955000054256.000000
2'2''4'84200510720000113233.000000
3'2''4'8420031045000052152.000000
4'2''4'8420041670000094656.166667
\n", "
" ], "text/plain": [ " METRO3 REGION LMED BEDRMS HOUSE_AGE VALUE ROOMS PER UTILITY\n", "0 '2' '4' 84200 4 9 720000 8 3 300.000000\n", "1 '2' '4' 84200 3 9 550000 5 4 256.000000\n", "2 '2' '4' 84200 5 10 720000 11 3 233.000000\n", "3 '2' '4' 84200 3 10 450000 5 2 152.000000\n", "4 '2' '4' 84200 4 16 700000 9 4 656.166667" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "keep_columns = [\n", " 'METRO3',\n", " 'REGION',\n", " 'LMED',\n", " 'BEDRMS',\n", " 'HOUSE_AGE',\n", " 'VALUE',\n", " 'ROOMS',\n", " 'PER',\n", " 'UTILITY',\n", "]\n", "\n", "df = df[keep_columns]\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
METRO3REGIONLMEDBEDRMSHOUSE_AGEVALUEROOMSPERUTILITY
024842004972000083300.000000
124842003955000054256.000000
22484200510720000113233.000000
3248420031045000052152.000000
4248420041670000094656.166667
\n", "
" ], "text/plain": [ " METRO3 REGION LMED BEDRMS HOUSE_AGE VALUE ROOMS PER UTILITY\n", "0 2 4 84200 4 9 720000 8 3 300.000000\n", "1 2 4 84200 3 9 550000 5 4 256.000000\n", "2 2 4 84200 5 10 720000 11 3 233.000000\n", "3 2 4 84200 3 10 450000 5 2 152.000000\n", "4 2 4 84200 4 16 700000 9 4 656.166667" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['METRO3'] = df['METRO3'].str.replace(\"'\", \"\").astype(int)\n", "df['REGION'] = df['REGION'].str.replace(\"'\", \"\").astype(int)\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "region_code = {\n", " 'Northeast': 1,\n", " 'Midwest': 2,\n", " 'South': 3,\n", " 'West': 4,\n", "}\n", "\n", "metro_code = {\n", " 'Central cities of metropolitan areas': 1,\n", " 'Inside metropolitan area, but not in central city': 2,\n", " 'Inside metropolitan area, but not in central city - rural': 3,\n", " 'Outside metropolitan areas, urbanized': 4,\n", " 'Outside metropolitan areas, rural': 5,\n", "}" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "df.to_csv('hud_dataset.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "X, y = df.drop(columns=['VALUE']), df['VALUE']" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
METRO3REGIONLMEDBEDRMSHOUSE_AGEROOMSPERUTILITY
22170226710942193126.083333
17193226280034162201.000000
137803418486444175502.500000
39271216420036174301.000000
26364326186439163461.166667
\n", "
" ], "text/plain": [ " METRO3 REGION LMED BEDRMS HOUSE_AGE ROOMS PER UTILITY\n", "22170 2 2 67109 4 21 9 3 126.083333\n", "17193 2 2 62800 3 41 6 2 201.000000\n", "137803 4 1 84864 4 41 7 5 502.500000\n", "39271 2 1 64200 3 61 7 4 301.000000\n", "26364 3 2 61864 3 91 6 3 461.166667" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train.head()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ " 0% (0 of 11) | | Elapsed Time: 0:00:00 ETA: --:--:--\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 9% (1 of 11) |## | Elapsed Time: 0:00:35 ETA: 0:05:51\n", " 18% (2 of 11) |#### | Elapsed Time: 0:01:09 ETA: 0:05:08\n", " 27% (3 of 11) |###### | Elapsed Time: 0:01:44 ETA: 0:04:38\n", " 36% (4 of 11) |######### | Elapsed Time: 0:02:22 ETA: 0:04:25\n", " 45% (5 of 11) |########### | Elapsed Time: 0:02:55 ETA: 0:03:22\n", " 54% (6 of 11) |############# | Elapsed Time: 0:03:28 ETA: 0:02:43\n", " 63% (7 of 11) |############### | Elapsed Time: 0:04:01 ETA: 0:02:12\n", " 72% (8 of 11) |################## | Elapsed Time: 0:04:36 ETA: 0:01:43\n", " 81% (9 of 11) |#################### | Elapsed Time: 0:05:07 ETA: 0:01:02\n", " 90% (10 of 11) |##################### | Elapsed Time: 0:05:40 ETA: 0:00:33\n", "100% (11 of 11) |########################| Elapsed Time: 0:06:14 Time: 0:06:14\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Mean Squared Error: 57308274833.9465\n", "Root Mean Squared Error: 239391.4678\n" ] } ], "source": [ "gam_model = LinearGAM(s(0, dtype='categorical') \n", " + s(1, dtype='categorical') \n", " + s(2, n_splines=250) \n", " + s(3, dtype='categorical') \n", " + s(4, n_splines=250) \n", " + s(5, dtype='categorical') \n", " + s(6, dtype='categorical') \n", " + s(7, n_splines=250) \n", " )\n", "\n", "gam_model.gridsearch(np.array(X_train), y_train)\n", "gam_model.fit(np.array(X_train), y_train)\n", 
"gam_test_preds = gam_model.predict(np.array(X_test))\n", "\n", "mse = mean_squared_error(y_test, gam_test_preds)\n", "print(f\"Mean Squared Error: {mse:.4f}\")\n", "\n", "rmse = np.sqrt(mse)\n", "print(f\"Root Mean Squared Error: {rmse:.4f}\")" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "with open('gam_model_hud.pkl', 'wb') as file:\n", "    pickle.dump(gam_model, file)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "X_train = torch.FloatTensor(X_train.values)\n", "y_train = torch.FloatTensor(y_train.values).reshape(-1, 1)\n", "X_test = torch.FloatTensor(X_test.values)\n", "y_test = torch.FloatTensor(y_test.values).reshape(-1, 1)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "class HousePriceModel(nn.Module):\n", "    def __init__(self, input_size):\n", "        super(HousePriceModel, self).__init__()\n", "        self.model = nn.Sequential(\n", "            nn.Linear(input_size, 128),\n", "            nn.LeakyReLU(0.2,inplace=True),\n", "            nn.Linear(128, 64),\n", "            nn.LeakyReLU(0.2,inplace=True),\n", "            nn.Linear(64, 32),\n", "            nn.LeakyReLU(0.2,inplace=True),\n", "            nn.Linear(32, 1)\n", "        )\n", "\n", "    def forward(self, x):\n", "        x = self.model(x)\n", "        return x\n", "\n", "model = HousePriceModel(X_train.shape[1])" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "criterion = nn.MSELoss()\n", "optimizer = optim.Adam(model.parameters(), lr=0.0001)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch [10/100], Loss: 461675626496.0000\n", "Epoch [20/100], Loss: 461912539136.0000\n", "Epoch [30/100], Loss: 465912332288.0000\n", "Epoch [40/100], Loss: 465710579712.0000\n", "Epoch [50/100], Loss: 465482645504.0000\n", "Epoch [60/100], Loss: 465170137088.0000\n", "Epoch [70/100], Loss: 464703356928.0000\n", "Epoch [80/100], Loss: 
464297361408.0000\n", "Epoch [90/100], Loss: 463879012352.0000\n", "Epoch [100/100], Loss: 463475539968.0000\n" ] } ], "source": [ "num_epochs = 100\n", "batch_size = 32\n", "\n", "for epoch in range(num_epochs):\n", " for i in range(0, len(X_train), batch_size):\n", " batch_X = X_train[i:i+batch_size]\n", " batch_y = y_train[i:i+batch_size]\n", " \n", " outputs = model(batch_X)\n", " loss = criterion(outputs, batch_y)\n", " \n", " optimizer.zero_grad()\n", " loss.backward()\n", " optimizer.step()\n", " \n", " if (epoch + 1) % 10 == 0:\n", " print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Mean Squared Error: 77202161664.0000\n", "Root Mean Squared Error: 277852.7812\n" ] } ], "source": [ "model.eval()\n", "with torch.no_grad():\n", " y_pred = model(X_test)\n", " mse = criterion(y_pred, y_test)\n", " rmse = torch.sqrt(mse)\n", " print(f'Mean Squared Error: {mse.item():.4f}')\n", " print(f'Root Mean Squared Error: {rmse.item():.4f}')" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "torch.save(model.state_dict(), 'dnn_model_hud.pth')" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
MedIncHouseAgeAveRoomsAveBedrmsPopulationAveOccupLatitudeLongitude
08.325241.06.9841271.023810322.02.55555637.88-122.23
18.301421.06.2381370.9718802401.02.10984237.86-122.22
27.257452.08.2881361.073446496.02.80226037.85-122.24
35.643152.05.8173521.073059558.02.54794537.85-122.25
43.846252.06.2818531.081081565.02.18146737.85-122.25
\n", "
" ], "text/plain": [ " MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude \\\n", "0 8.3252 41.0 6.984127 1.023810 322.0 2.555556 37.88 \n", "1 8.3014 21.0 6.238137 0.971880 2401.0 2.109842 37.86 \n", "2 7.2574 52.0 8.288136 1.073446 496.0 2.802260 37.85 \n", "3 5.6431 52.0 5.817352 1.073059 558.0 2.547945 37.85 \n", "4 3.8462 52.0 6.281853 1.081081 565.0 2.181467 37.85 \n", "\n", " Longitude \n", "0 -122.23 \n", "1 -122.22 \n", "2 -122.24 \n", "3 -122.25 \n", "4 -122.25 " ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "housing = fetch_california_housing()\n", "\n", "X, y = pd.DataFrame(housing.data), pd.DataFrame(housing.target)\n", "X.columns = housing.feature_names\n", "X.head()" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ " 0% (0 of 11) | | Elapsed Time: 0:00:00 ETA: --:--:--\n", " 9% (1 of 11) |## | Elapsed Time: 0:00:36 ETA: 0:06:09\n", " 18% (2 of 11) |#### | Elapsed Time: 0:01:13 ETA: 0:05:24\n", " 27% (3 of 11) |###### | Elapsed Time: 0:01:48 ETA: 0:04:46\n", " 36% (4 of 11) |######### | Elapsed Time: 0:02:25 ETA: 0:04:15\n", " 45% (5 of 11) |########### | Elapsed Time: 0:03:00 ETA: 0:03:30\n", " 54% (6 of 11) |############# | Elapsed Time: 0:03:35 ETA: 0:02:55\n", " 63% (7 of 11) |############### | Elapsed Time: 0:04:13 ETA: 0:02:30\n", " 72% (8 of 11) |################## | Elapsed Time: 0:04:49 ETA: 0:01:48\n", " 81% (9 of 11) |#################### | Elapsed Time: 0:05:29 ETA: 0:01:20\n", " 90% (10 of 11) |##################### | Elapsed Time: 0:06:09 ETA: 0:00:40\n", "100% (11 of 11) |########################| Elapsed Time: 0:06:47 Time: 0:06:47\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Mean Squared 
Error: 0.3081\n", "Root Mean Squared Error: 0.5550\n" ] } ], "source": [ "gam_model = LinearGAM(s(0, n_splines=250) \n", "                      + s(1, n_splines=250) \n", "                      + s(2, n_splines=250) \n", "                      + s(3, n_splines=250) \n", "                      + s(4, n_splines=250) \n", "                      + s(5, n_splines=250) \n", "                      + s(6, n_splines=250) \n", "                      + s(7, n_splines=250) \n", "                      )\n", "gam_model.gridsearch(np.array(X_train), y_train)\n", "gam_model.fit(np.array(X_train), y_train)\n", "gam_test_preds = gam_model.predict(np.array(X_test))\n", "\n", "mse = mean_squared_error(y_test, gam_test_preds)\n", "print(f\"Mean Squared Error: {mse:.4f}\")\n", "\n", "rmse = np.sqrt(mse)\n", "print(f\"Root Mean Squared Error: {rmse:.4f}\")" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "with open('gam_model_california.pkl', 'wb') as file:\n", "    pickle.dump(gam_model, file)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "X_train = torch.FloatTensor(X_train.values)\n", "y_train = torch.FloatTensor(y_train.values).reshape(-1, 1)\n", "X_test = torch.FloatTensor(X_test.values)\n", "y_test = torch.FloatTensor(y_test.values).reshape(-1, 1)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "class HousePriceModel(nn.Module):\n", "    def __init__(self, input_size):\n", "        super(HousePriceModel, self).__init__()\n", "        self.model = nn.Sequential(\n", "            nn.Linear(input_size, 128),\n", "            nn.LeakyReLU(0.2,inplace=True),\n", "            nn.Linear(128, 64),\n", "            nn.LeakyReLU(0.2,inplace=True),\n", "            nn.Linear(64, 32),\n", "            nn.LeakyReLU(0.2,inplace=True),\n", "            nn.Linear(32, 1)\n", "        )\n", "\n", "    def forward(self, x):\n", "        x = self.model(x)\n", "        return x\n", "\n", "model = HousePriceModel(X_train.shape[1])" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "criterion = nn.MSELoss()\n", "optimizer = optim.Adam(model.parameters(), lr=0.0001)" ] }, { "cell_type": "code", "execution_count": 
27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch [10/100], Loss: 1.0281\n", "Epoch [20/100], Loss: 0.7823\n", "Epoch [30/100], Loss: 0.6930\n", "Epoch [40/100], Loss: 0.7010\n", "Epoch [50/100], Loss: 0.6770\n", "Epoch [60/100], Loss: 0.6713\n", "Epoch [70/100], Loss: 0.6740\n", "Epoch [80/100], Loss: 0.7874\n", "Epoch [90/100], Loss: 0.7111\n", "Epoch [100/100], Loss: 0.6300\n" ] } ], "source": [ "num_epochs = 100\n", "batch_size = 32\n", "\n", "for epoch in range(num_epochs):\n", " for i in range(0, len(X_train), batch_size):\n", " batch_X = X_train[i:i+batch_size]\n", " batch_y = y_train[i:i+batch_size]\n", " \n", " outputs = model(batch_X)\n", " loss = criterion(outputs, batch_y)\n", " \n", " optimizer.zero_grad()\n", " loss.backward()\n", " optimizer.step()\n", " \n", " if (epoch + 1) % 10 == 0:\n", " print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Mean Squared Error: 0.9678\n", "Root Mean Squared Error: 0.9838\n" ] } ], "source": [ "model.eval()\n", "with torch.no_grad():\n", " y_pred = model(X_test)\n", " mse = criterion(y_pred, y_test)\n", " rmse = torch.sqrt(mse)\n", " print(f'Mean Squared Error: {mse.item():.4f}')\n", " print(f'Root Mean Squared Error: {rmse.item():.4f}')" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "torch.save(model.state_dict(), 'dnn_model_california.pth')" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
LatitudeLongitudeCountyCityIncorportation_datepop_april_1980pop_april_1990pop_april_2000pop_april_2010
034.582769-117.409214San BernardinoAdelanto19702164.085171813031765
134.153339-118.761675Los AngelesAgoura Hills198220390.0203902053720330
237.765206-122.241636AlamedaAlameda185463852.0764597225973812
337.886869-122.297747AlamedaAlbany190815130.0163271644418539
434.095286-118.127014Los AngelesAlhambra190364767.0821068580483089
\n", "
" ], "text/plain": [ " Latitude Longitude County City Incorportation_date \\\n", "0 34.582769 -117.409214 San Bernardino Adelanto 1970 \n", "1 34.153339 -118.761675 Los Angeles Agoura Hills 1982 \n", "2 37.765206 -122.241636 Alameda Alameda 1854 \n", "3 37.886869 -122.297747 Alameda Albany 1908 \n", "4 34.095286 -118.127014 Los Angeles Alhambra 1903 \n", "\n", " pop_april_1980 pop_april_1990 pop_april_2000 pop_april_2010 \n", "0 2164.0 8517 18130 31765 \n", "1 20390.0 20390 20537 20330 \n", "2 63852.0 76459 72259 73812 \n", "3 15130.0 16327 16444 18539 \n", "4 64767.0 82106 85804 83089 " ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_csv('california_cities.csv')\n", "df = df.iloc[:, 1:]\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "df = df.groupby(['County']).agg({\n", " 'Latitude': 'mean',\n", " 'Longitude': 'mean',\n", " 'pop_april_1990': 'sum',\n", "})" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "df = df.reset_index()" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "df.to_csv('california_counties.csv')" ] } ], "metadata": { "kernelspec": { "display_name": "cnn_module", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.15" } }, "nbformat": 4, "nbformat_minor": 2 }