{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"!pip install pygam seaborn statsmodels --quiet"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import pickle\n",
"import torch\n",
"import torch.nn as nn\n",
"import torch.optim as optim\n",
"from sklearn.datasets import fetch_california_housing\n",
"from sklearn.model_selection import train_test_split\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.preprocessing import StandardScaler,OrdinalEncoder\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.linear_model import LinearRegression, LogisticRegression\n",
"from sklearn.metrics import classification_report, recall_score, precision_score, accuracy_score\n",
"from sklearn.metrics import mean_squared_error\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import zipfile\n",
"from statsmodels.genmod.generalized_linear_model import GLM\n",
"from statsmodels.genmod.families import Gamma\n",
"from statsmodels.genmod.families.links import Log\n",
"from statsmodels.tools import add_constant\n",
"from pygam import LinearGAM, GammaGAM, s, f"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" CONTROL | \n",
" AGE1 | \n",
" METRO3 | \n",
" REGION | \n",
" LMED | \n",
" FMR | \n",
" L30 | \n",
" L50 | \n",
" L80 | \n",
" IPOV | \n",
" ... | \n",
" FMTCOST06RELAMICAT | \n",
" FMTCOST08RELAMICAT | \n",
" FMTCOST12RELAMICAT | \n",
" FMTCOSTMEDRELAMICAT | \n",
" FMTINCRELAMICAT | \n",
" FMTASSISTED | \n",
" FMTBURDEN | \n",
" FMTREGION | \n",
" FMTSTATUS | \n",
" HOUSE_AGE | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" '036000001146' | \n",
" 34 | \n",
" '2' | \n",
" '4' | \n",
" 84200 | \n",
" 2580 | \n",
" 24950 | \n",
" 41550 | \n",
" 66500 | \n",
" 17849 | \n",
" ... | \n",
" '7 120% AMI +' | \n",
" '7 120% AMI +' | \n",
" '7 120% AMI +' | \n",
" '7 120% AMI +' | \n",
" '7 120% AMI +' | \n",
" '.' | \n",
" '2 30% to 50%' | \n",
" 'West' | \n",
" '-5' | \n",
" 9 | \n",
"
\n",
" \n",
" | 1 | \n",
" '036000001147' | \n",
" 43 | \n",
" '2' | \n",
" '4' | \n",
" 84200 | \n",
" 2241 | \n",
" 27700 | \n",
" 46150 | \n",
" 73850 | \n",
" 22629 | \n",
" ... | \n",
" '7 120% AMI +' | \n",
" '7 120% AMI +' | \n",
" '7 120% AMI +' | \n",
" '7 120% AMI +' | \n",
" '7 120% AMI +' | \n",
" '.' | \n",
" '1 Less than 30%' | \n",
" 'West' | \n",
" '-5' | \n",
" 9 | \n",
"
\n",
" \n",
" | 2 | \n",
" '036000001149' | \n",
" 60 | \n",
" '2' | \n",
" '4' | \n",
" 84200 | \n",
" 2577 | \n",
" 24950 | \n",
" 41550 | \n",
" 66500 | \n",
" 17399 | \n",
" ... | \n",
" '7 120% AMI +' | \n",
" '7 120% AMI +' | \n",
" '7 120% AMI +' | \n",
" '7 120% AMI +' | \n",
" '7 120% AMI +' | \n",
" '.' | \n",
" '1 Less than 30%' | \n",
" 'West' | \n",
" '-5' | \n",
" 10 | \n",
"
\n",
" \n",
" | 3 | \n",
" '036000001150' | \n",
" 37 | \n",
" '2' | \n",
" '4' | \n",
" 84200 | \n",
" 2241 | \n",
" 22200 | \n",
" 36950 | \n",
" 59100 | \n",
" 14985 | \n",
" ... | \n",
" '7 120% AMI +' | \n",
" '7 120% AMI +' | \n",
" '7 120% AMI +' | \n",
" '7 120% AMI +' | \n",
" '7 120% AMI +' | \n",
" '.' | \n",
" '2 30% to 50%' | \n",
" 'West' | \n",
" '-5' | \n",
" 10 | \n",
"
\n",
" \n",
" | 4 | \n",
" '036000001151' | \n",
" 33 | \n",
" '2' | \n",
" '4' | \n",
" 84200 | \n",
" 2580 | \n",
" 27700 | \n",
" 46150 | \n",
" 73850 | \n",
" 22557 | \n",
" ... | \n",
" '7 120% AMI +' | \n",
" '7 120% AMI +' | \n",
" '7 120% AMI +' | \n",
" '7 120% AMI +' | \n",
" '7 120% AMI +' | \n",
" '.' | \n",
" '2 30% to 50%' | \n",
" 'West' | \n",
" '-5' | \n",
" 16 | \n",
"
\n",
" \n",
"
\n",
"
5 rows × 100 columns
\n",
"
"
],
"text/plain": [
" CONTROL AGE1 METRO3 REGION LMED FMR L30 L50 L80 \\\n",
"0 '036000001146' 34 '2' '4' 84200 2580 24950 41550 66500 \n",
"1 '036000001147' 43 '2' '4' 84200 2241 27700 46150 73850 \n",
"2 '036000001149' 60 '2' '4' 84200 2577 24950 41550 66500 \n",
"3 '036000001150' 37 '2' '4' 84200 2241 22200 36950 59100 \n",
"4 '036000001151' 33 '2' '4' 84200 2580 27700 46150 73850 \n",
"\n",
" IPOV ... FMTCOST06RELAMICAT FMTCOST08RELAMICAT FMTCOST12RELAMICAT \\\n",
"0 17849 ... '7 120% AMI +' '7 120% AMI +' '7 120% AMI +' \n",
"1 22629 ... '7 120% AMI +' '7 120% AMI +' '7 120% AMI +' \n",
"2 17399 ... '7 120% AMI +' '7 120% AMI +' '7 120% AMI +' \n",
"3 14985 ... '7 120% AMI +' '7 120% AMI +' '7 120% AMI +' \n",
"4 22557 ... '7 120% AMI +' '7 120% AMI +' '7 120% AMI +' \n",
"\n",
" FMTCOSTMEDRELAMICAT FMTINCRELAMICAT FMTASSISTED FMTBURDEN \\\n",
"0 '7 120% AMI +' '7 120% AMI +' '.' '2 30% to 50%' \n",
"1 '7 120% AMI +' '7 120% AMI +' '.' '1 Less than 30%' \n",
"2 '7 120% AMI +' '7 120% AMI +' '.' '1 Less than 30%' \n",
"3 '7 120% AMI +' '7 120% AMI +' '.' '2 30% to 50%' \n",
"4 '7 120% AMI +' '7 120% AMI +' '.' '2 30% to 50%' \n",
"\n",
" FMTREGION FMTSTATUS HOUSE_AGE \n",
"0 'West' '-5' 9 \n",
"1 'West' '-5' 9 \n",
"2 'West' '-5' 10 \n",
"3 'West' '-5' 10 \n",
"4 'West' '-5' 16 \n",
"\n",
"[5 rows x 100 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_2011, df_2013 = pd.read_csv('thads2011.txt'),pd.read_csv('thads2013n.txt')\n",
"df_2011['HOUSE_AGE'], df_2013['HOUSE_AGE'] = 2011 - df_2011['BUILT'], 2013 - df_2013['BUILT']\n",
"df = pd.concat([df_2011, df_2013], ignore_index=True)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"df = df[df['OWNRENT'] == \"'1'\"]\n",
"df = df[df['FMTSTRUCTURETYPE'] == \"'1 Single Family'\"]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" METRO3 | \n",
" REGION | \n",
" LMED | \n",
" BEDRMS | \n",
" HOUSE_AGE | \n",
" VALUE | \n",
" ROOMS | \n",
" PER | \n",
" UTILITY | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" '2' | \n",
" '4' | \n",
" 84200 | \n",
" 4 | \n",
" 9 | \n",
" 720000 | \n",
" 8 | \n",
" 3 | \n",
" 300.000000 | \n",
"
\n",
" \n",
" | 1 | \n",
" '2' | \n",
" '4' | \n",
" 84200 | \n",
" 3 | \n",
" 9 | \n",
" 550000 | \n",
" 5 | \n",
" 4 | \n",
" 256.000000 | \n",
"
\n",
" \n",
" | 2 | \n",
" '2' | \n",
" '4' | \n",
" 84200 | \n",
" 5 | \n",
" 10 | \n",
" 720000 | \n",
" 11 | \n",
" 3 | \n",
" 233.000000 | \n",
"
\n",
" \n",
" | 3 | \n",
" '2' | \n",
" '4' | \n",
" 84200 | \n",
" 3 | \n",
" 10 | \n",
" 450000 | \n",
" 5 | \n",
" 2 | \n",
" 152.000000 | \n",
"
\n",
" \n",
" | 4 | \n",
" '2' | \n",
" '4' | \n",
" 84200 | \n",
" 4 | \n",
" 16 | \n",
" 700000 | \n",
" 9 | \n",
" 4 | \n",
" 656.166667 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" METRO3 REGION LMED BEDRMS HOUSE_AGE VALUE ROOMS PER UTILITY\n",
"0 '2' '4' 84200 4 9 720000 8 3 300.000000\n",
"1 '2' '4' 84200 3 9 550000 5 4 256.000000\n",
"2 '2' '4' 84200 5 10 720000 11 3 233.000000\n",
"3 '2' '4' 84200 3 10 450000 5 2 152.000000\n",
"4 '2' '4' 84200 4 16 700000 9 4 656.166667"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"keep_columns = [\n",
" 'METRO3',\n",
" 'REGION',\n",
" 'LMED',\n",
" 'BEDRMS',\n",
" 'HOUSE_AGE',\n",
" 'VALUE',\n",
" 'ROOMS',\n",
" 'PER',\n",
" 'UTILITY',\n",
"]\n",
"\n",
"df = df[keep_columns]\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" METRO3 | \n",
" REGION | \n",
" LMED | \n",
" BEDRMS | \n",
" HOUSE_AGE | \n",
" VALUE | \n",
" ROOMS | \n",
" PER | \n",
" UTILITY | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 2 | \n",
" 4 | \n",
" 84200 | \n",
" 4 | \n",
" 9 | \n",
" 720000 | \n",
" 8 | \n",
" 3 | \n",
" 300.000000 | \n",
"
\n",
" \n",
" | 1 | \n",
" 2 | \n",
" 4 | \n",
" 84200 | \n",
" 3 | \n",
" 9 | \n",
" 550000 | \n",
" 5 | \n",
" 4 | \n",
" 256.000000 | \n",
"
\n",
" \n",
" | 2 | \n",
" 2 | \n",
" 4 | \n",
" 84200 | \n",
" 5 | \n",
" 10 | \n",
" 720000 | \n",
" 11 | \n",
" 3 | \n",
" 233.000000 | \n",
"
\n",
" \n",
" | 3 | \n",
" 2 | \n",
" 4 | \n",
" 84200 | \n",
" 3 | \n",
" 10 | \n",
" 450000 | \n",
" 5 | \n",
" 2 | \n",
" 152.000000 | \n",
"
\n",
" \n",
" | 4 | \n",
" 2 | \n",
" 4 | \n",
" 84200 | \n",
" 4 | \n",
" 16 | \n",
" 700000 | \n",
" 9 | \n",
" 4 | \n",
" 656.166667 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" METRO3 REGION LMED BEDRMS HOUSE_AGE VALUE ROOMS PER UTILITY\n",
"0 2 4 84200 4 9 720000 8 3 300.000000\n",
"1 2 4 84200 3 9 550000 5 4 256.000000\n",
"2 2 4 84200 5 10 720000 11 3 233.000000\n",
"3 2 4 84200 3 10 450000 5 2 152.000000\n",
"4 2 4 84200 4 16 700000 9 4 656.166667"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['METRO3'] = df['METRO3'].str.replace(\"'\", \"\").astype(int)\n",
"df['REGION'] = df['REGION'].str.replace(\"'\", \"\").astype(int)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"region_code = {\n",
" 'Northeast': 1,\n",
" 'Midwest': 2,\n",
" 'South': 3,\n",
" 'West': 4,\n",
"}\n",
"\n",
"metro_code = {\n",
" 'Central cities of metropolitan areas': 1,\n",
" 'Inside metropolitan area, but not in central city': 2,\n",
" 'Inside metropolitan area, but not in central city - rural': 3,\n",
" 'Outside metropolitan areas, urbanized': 4,\n",
" 'Outside metropolitan areas, rural': 5,\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"df.to_csv('hud_dataset.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"X, y = df.drop(columns=['VALUE']), df['VALUE']"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" METRO3 | \n",
" REGION | \n",
" LMED | \n",
" BEDRMS | \n",
" HOUSE_AGE | \n",
" ROOMS | \n",
" PER | \n",
" UTILITY | \n",
"
\n",
" \n",
" \n",
" \n",
" | 22170 | \n",
" 2 | \n",
" 2 | \n",
" 67109 | \n",
" 4 | \n",
" 21 | \n",
" 9 | \n",
" 3 | \n",
" 126.083333 | \n",
"
\n",
" \n",
" | 17193 | \n",
" 2 | \n",
" 2 | \n",
" 62800 | \n",
" 3 | \n",
" 41 | \n",
" 6 | \n",
" 2 | \n",
" 201.000000 | \n",
"
\n",
" \n",
" | 137803 | \n",
" 4 | \n",
" 1 | \n",
" 84864 | \n",
" 4 | \n",
" 41 | \n",
" 7 | \n",
" 5 | \n",
" 502.500000 | \n",
"
\n",
" \n",
" | 39271 | \n",
" 2 | \n",
" 1 | \n",
" 64200 | \n",
" 3 | \n",
" 61 | \n",
" 7 | \n",
" 4 | \n",
" 301.000000 | \n",
"
\n",
" \n",
" | 26364 | \n",
" 3 | \n",
" 2 | \n",
" 61864 | \n",
" 3 | \n",
" 91 | \n",
" 6 | \n",
" 3 | \n",
" 461.166667 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" METRO3 REGION LMED BEDRMS HOUSE_AGE ROOMS PER UTILITY\n",
"22170 2 2 67109 4 21 9 3 126.083333\n",
"17193 2 2 62800 3 41 6 2 201.000000\n",
"137803 4 1 84864 4 41 7 5 502.500000\n",
"39271 2 1 64200 3 61 7 4 301.000000\n",
"26364 3 2 61864 3 91 6 3 461.166667"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_train.head()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0% (0 of 11) | | Elapsed Time: 0:00:00 ETA: --:--:--\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 9% (1 of 11) |## | Elapsed Time: 0:00:35 ETA: 0:05:51\n",
" 18% (2 of 11) |#### | Elapsed Time: 0:01:09 ETA: 0:05:08\n",
" 27% (3 of 11) |###### | Elapsed Time: 0:01:44 ETA: 0:04:38\n",
" 36% (4 of 11) |######### | Elapsed Time: 0:02:22 ETA: 0:04:25\n",
" 45% (5 of 11) |########### | Elapsed Time: 0:02:55 ETA: 0:03:22\n",
" 54% (6 of 11) |############# | Elapsed Time: 0:03:28 ETA: 0:02:43\n",
" 63% (7 of 11) |############### | Elapsed Time: 0:04:01 ETA: 0:02:12\n",
" 72% (8 of 11) |################## | Elapsed Time: 0:04:36 ETA: 0:01:43\n",
" 81% (9 of 11) |#################### | Elapsed Time: 0:05:07 ETA: 0:01:02\n",
" 90% (10 of 11) |##################### | Elapsed Time: 0:05:40 ETA: 0:00:33\n",
"100% (11 of 11) |########################| Elapsed Time: 0:06:14 Time: 0:06:14\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Mean Squared Error: 57308274833.9465\n",
"Root Mean Squared Error: 239391.4678\n"
]
}
],
"source": [
"gam_model = LinearGAM(s(0, dtype='categorical') \n",
" + s(1, dtype='categorical') \n",
" + s(2, n_splines=250) \n",
" + s(3, dtype='categorical') \n",
" + s(4, n_splines=250) \n",
" + s(5, dtype='categorical') \n",
" + s(6, dtype='categorical') \n",
" + s(7, n_splines=250) \n",
" )\n",
"\n",
"gam_model.gridsearch(np.array(X_train), y_train)\n",
"gam_model.fit(np.array(X_train), y_train)\n",
"gam_test_preds = gam_model.predict(X_test)\n",
"\n",
"mse = mean_squared_error(y_test, gam_test_preds)\n",
"print(f\"Mean Squared Error: {mse:.4f}\")\n",
"\n",
"rmse = np.sqrt(mse)\n",
"print(f\"Root Mean Squared Error: {rmse:.4f}\")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"with open('gam_model_hud.pkl', 'wb') as file:\n",
" pickle.dump(gam_model, file)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"X_train = torch.FloatTensor(X_train.values)\n",
"y_train = torch.FloatTensor(y_train.values).reshape(-1, 1)\n",
"X_test = torch.FloatTensor(X_test.values)\n",
"y_test = torch.FloatTensor(y_test.values).reshape(-1, 1)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"class HousePriceModel(nn.Module):\n",
" def __init__(self, input_size):\n",
" super(HousePriceModel, self).__init__()\n",
" self.model = nn.Sequential(\n",
" nn.Linear(input_size, 128),\n",
" nn.LeakyReLU(0.2,inplace=True),\n",
" nn.Linear(128, 64),\n",
" nn.LeakyReLU(0.2,inplace=True),\n",
" nn.Linear(64, 32),\n",
" nn.LeakyReLU(0.2,inplace=True),\n",
" nn.Linear(32, 1)\n",
" )\n",
"\n",
" def forward(self, x):\n",
" x = self.model(x)\n",
" return x\n",
"\n",
"model = HousePriceModel(X_train.shape[1])"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"criterion = nn.MSELoss()\n",
"optimizer = optim.Adam(model.parameters(), lr=0.0001)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch [10/100], Loss: 461675626496.0000\n",
"Epoch [20/100], Loss: 461912539136.0000\n",
"Epoch [30/100], Loss: 465912332288.0000\n",
"Epoch [40/100], Loss: 465710579712.0000\n",
"Epoch [50/100], Loss: 465482645504.0000\n",
"Epoch [60/100], Loss: 465170137088.0000\n",
"Epoch [70/100], Loss: 464703356928.0000\n",
"Epoch [80/100], Loss: 464297361408.0000\n",
"Epoch [90/100], Loss: 463879012352.0000\n",
"Epoch [100/100], Loss: 463475539968.0000\n"
]
}
],
"source": [
"num_epochs = 100\n",
"batch_size = 32\n",
"\n",
"for epoch in range(num_epochs):\n",
" for i in range(0, len(X_train), batch_size):\n",
" batch_X = X_train[i:i+batch_size]\n",
" batch_y = y_train[i:i+batch_size]\n",
" \n",
" outputs = model(batch_X)\n",
" loss = criterion(outputs, batch_y)\n",
" \n",
" optimizer.zero_grad()\n",
" loss.backward()\n",
" optimizer.step()\n",
" \n",
" if (epoch + 1) % 10 == 0:\n",
" print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Mean Squared Error: 77202161664.0000\n",
"Root Mean Squared Error: 277852.7812\n"
]
}
],
"source": [
"model.eval()\n",
"with torch.no_grad():\n",
" y_pred = model(X_test)\n",
" mse = criterion(y_pred, y_test)\n",
" rmse = torch.sqrt(mse)\n",
" print(f'Mean Squared Error: {mse.item():.4f}')\n",
" print(f'Root Mean Squared Error: {rmse.item():.4f}')"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"torch.save(model.state_dict(), 'dnn_model_hud.pth')"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" MedInc | \n",
" HouseAge | \n",
" AveRooms | \n",
" AveBedrms | \n",
" Population | \n",
" AveOccup | \n",
" Latitude | \n",
" Longitude | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 8.3252 | \n",
" 41.0 | \n",
" 6.984127 | \n",
" 1.023810 | \n",
" 322.0 | \n",
" 2.555556 | \n",
" 37.88 | \n",
" -122.23 | \n",
"
\n",
" \n",
" | 1 | \n",
" 8.3014 | \n",
" 21.0 | \n",
" 6.238137 | \n",
" 0.971880 | \n",
" 2401.0 | \n",
" 2.109842 | \n",
" 37.86 | \n",
" -122.22 | \n",
"
\n",
" \n",
" | 2 | \n",
" 7.2574 | \n",
" 52.0 | \n",
" 8.288136 | \n",
" 1.073446 | \n",
" 496.0 | \n",
" 2.802260 | \n",
" 37.85 | \n",
" -122.24 | \n",
"
\n",
" \n",
" | 3 | \n",
" 5.6431 | \n",
" 52.0 | \n",
" 5.817352 | \n",
" 1.073059 | \n",
" 558.0 | \n",
" 2.547945 | \n",
" 37.85 | \n",
" -122.25 | \n",
"
\n",
" \n",
" | 4 | \n",
" 3.8462 | \n",
" 52.0 | \n",
" 6.281853 | \n",
" 1.081081 | \n",
" 565.0 | \n",
" 2.181467 | \n",
" 37.85 | \n",
" -122.25 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude \\\n",
"0 8.3252 41.0 6.984127 1.023810 322.0 2.555556 37.88 \n",
"1 8.3014 21.0 6.238137 0.971880 2401.0 2.109842 37.86 \n",
"2 7.2574 52.0 8.288136 1.073446 496.0 2.802260 37.85 \n",
"3 5.6431 52.0 5.817352 1.073059 558.0 2.547945 37.85 \n",
"4 3.8462 52.0 6.281853 1.081081 565.0 2.181467 37.85 \n",
"\n",
" Longitude \n",
"0 -122.23 \n",
"1 -122.22 \n",
"2 -122.24 \n",
"3 -122.25 \n",
"4 -122.25 "
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"housing = fetch_california_housing()\n",
"\n",
"X, y = pd.DataFrame(housing.data), pd.DataFrame(housing.target)\n",
"X.columns = housing.feature_names\n",
"X.head()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0% (0 of 11) | | Elapsed Time: 0:00:00 ETA: --:--:--\n",
" 9% (1 of 11) |## | Elapsed Time: 0:00:36 ETA: 0:06:09\n",
" 18% (2 of 11) |#### | Elapsed Time: 0:01:13 ETA: 0:05:24\n",
" 27% (3 of 11) |###### | Elapsed Time: 0:01:48 ETA: 0:04:46\n",
" 36% (4 of 11) |######### | Elapsed Time: 0:02:25 ETA: 0:04:15\n",
" 45% (5 of 11) |########### | Elapsed Time: 0:03:00 ETA: 0:03:30\n",
" 54% (6 of 11) |############# | Elapsed Time: 0:03:35 ETA: 0:02:55\n",
" 63% (7 of 11) |############### | Elapsed Time: 0:04:13 ETA: 0:02:30\n",
" 72% (8 of 11) |################## | Elapsed Time: 0:04:49 ETA: 0:01:48\n",
" 81% (9 of 11) |#################### | Elapsed Time: 0:05:29 ETA: 0:01:20\n",
" 90% (10 of 11) |##################### | Elapsed Time: 0:06:09 ETA: 0:00:40\n",
"100% (11 of 11) |########################| Elapsed Time: 0:06:47 Time: 0:06:47\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Mean Squared Error: 0.3081\n",
"Root Mean Squared Error: 0.5550\n"
]
}
],
"source": [
"gam_model = LinearGAM(s(0, n_splines=250) \n",
" + s(1, n_splines=250) \n",
" + s(2, n_splines=250) \n",
" + s(3, n_splines=250) \n",
" + s(4, n_splines=250) \n",
" + s(5, n_splines=250) \n",
" + s(6, n_splines=250) \n",
" + s(7, n_splines=250) \n",
" )\n",
"gam_model.gridsearch(np.array(X_train), y_train)\n",
"gam_model.fit(np.array(X_train), y_train)\n",
"gam_test_preds = gam_model.predict(X_test)\n",
"\n",
"mse = mean_squared_error(y_test, gam_test_preds)\n",
"print(f\"Mean Squared Error: {mse:.4f}\")\n",
"\n",
"rmse = np.sqrt(mse)\n",
"print(f\"Root Mean Squared Error: {rmse:.4f}\")"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"with open('gam_model_california.pkl', 'wb') as file:\n",
" pickle.dump(gam_model, file)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"X_train = torch.FloatTensor(X_train.values)\n",
"y_train = torch.FloatTensor(y_train.values).reshape(-1, 1)\n",
"X_test = torch.FloatTensor(X_test.values)\n",
"y_test = torch.FloatTensor(y_test.values).reshape(-1, 1)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"class HousePriceModel(nn.Module):\n",
" def __init__(self, input_size):\n",
" super(HousePriceModel, self).__init__()\n",
" self.model = nn.Sequential(\n",
" nn.Linear(input_size, 128),\n",
" nn.LeakyReLU(0.2,inplace=True),\n",
" nn.Linear(128, 64),\n",
" nn.LeakyReLU(0.2,inplace=True),\n",
" nn.Linear(64, 32),\n",
" nn.LeakyReLU(0.2,inplace=True),\n",
" nn.Linear(32, 1)\n",
" )\n",
"\n",
" def forward(self, x):\n",
" x = self.model(x)\n",
" return x\n",
"\n",
"model = HousePriceModel(X_train.shape[1])"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"criterion = nn.MSELoss()\n",
"optimizer = optim.Adam(model.parameters(), lr=0.0001)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch [10/100], Loss: 1.0281\n",
"Epoch [20/100], Loss: 0.7823\n",
"Epoch [30/100], Loss: 0.6930\n",
"Epoch [40/100], Loss: 0.7010\n",
"Epoch [50/100], Loss: 0.6770\n",
"Epoch [60/100], Loss: 0.6713\n",
"Epoch [70/100], Loss: 0.6740\n",
"Epoch [80/100], Loss: 0.7874\n",
"Epoch [90/100], Loss: 0.7111\n",
"Epoch [100/100], Loss: 0.6300\n"
]
}
],
"source": [
"num_epochs = 100\n",
"batch_size = 32\n",
"\n",
"for epoch in range(num_epochs):\n",
" for i in range(0, len(X_train), batch_size):\n",
" batch_X = X_train[i:i+batch_size]\n",
" batch_y = y_train[i:i+batch_size]\n",
" \n",
" outputs = model(batch_X)\n",
" loss = criterion(outputs, batch_y)\n",
" \n",
" optimizer.zero_grad()\n",
" loss.backward()\n",
" optimizer.step()\n",
" \n",
" if (epoch + 1) % 10 == 0:\n",
" print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Mean Squared Error: 0.9678\n",
"Root Mean Squared Error: 0.9838\n"
]
}
],
"source": [
"model.eval()\n",
"with torch.no_grad():\n",
" y_pred = model(X_test)\n",
" mse = criterion(y_pred, y_test)\n",
" rmse = torch.sqrt(mse)\n",
" print(f'Mean Squared Error: {mse.item():.4f}')\n",
" print(f'Root Mean Squared Error: {rmse.item():.4f}')"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"torch.save(model.state_dict(), 'dnn_model_california.pth')"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Latitude | \n",
" Longitude | \n",
" County | \n",
" City | \n",
" Incorportation_date | \n",
" pop_april_1980 | \n",
" pop_april_1990 | \n",
" pop_april_2000 | \n",
" pop_april_2010 | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 34.582769 | \n",
" -117.409214 | \n",
" San Bernardino | \n",
" Adelanto | \n",
" 1970 | \n",
" 2164.0 | \n",
" 8517 | \n",
" 18130 | \n",
" 31765 | \n",
"
\n",
" \n",
" | 1 | \n",
" 34.153339 | \n",
" -118.761675 | \n",
" Los Angeles | \n",
" Agoura Hills | \n",
" 1982 | \n",
" 20390.0 | \n",
" 20390 | \n",
" 20537 | \n",
" 20330 | \n",
"
\n",
" \n",
" | 2 | \n",
" 37.765206 | \n",
" -122.241636 | \n",
" Alameda | \n",
" Alameda | \n",
" 1854 | \n",
" 63852.0 | \n",
" 76459 | \n",
" 72259 | \n",
" 73812 | \n",
"
\n",
" \n",
" | 3 | \n",
" 37.886869 | \n",
" -122.297747 | \n",
" Alameda | \n",
" Albany | \n",
" 1908 | \n",
" 15130.0 | \n",
" 16327 | \n",
" 16444 | \n",
" 18539 | \n",
"
\n",
" \n",
" | 4 | \n",
" 34.095286 | \n",
" -118.127014 | \n",
" Los Angeles | \n",
" Alhambra | \n",
" 1903 | \n",
" 64767.0 | \n",
" 82106 | \n",
" 85804 | \n",
" 83089 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Latitude Longitude County City Incorportation_date \\\n",
"0 34.582769 -117.409214 San Bernardino Adelanto 1970 \n",
"1 34.153339 -118.761675 Los Angeles Agoura Hills 1982 \n",
"2 37.765206 -122.241636 Alameda Alameda 1854 \n",
"3 37.886869 -122.297747 Alameda Albany 1908 \n",
"4 34.095286 -118.127014 Los Angeles Alhambra 1903 \n",
"\n",
" pop_april_1980 pop_april_1990 pop_april_2000 pop_april_2010 \n",
"0 2164.0 8517 18130 31765 \n",
"1 20390.0 20390 20537 20330 \n",
"2 63852.0 76459 72259 73812 \n",
"3 15130.0 16327 16444 18539 \n",
"4 64767.0 82106 85804 83089 "
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_csv('california_cities.csv')\n",
"df = df.iloc[:, 1:]\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"df = df.groupby(['County']).agg({\n",
" 'Latitude': 'mean',\n",
" 'Longitude': 'mean',\n",
" 'pop_april_1990': 'sum',\n",
"})"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"df = df.reset_index()"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"df.to_csv('california_counties.csv')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "cnn_module",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.15"
}
},
"nbformat": 4,
"nbformat_minor": 2
}