{ "cells": [ { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
longitudelatitudehousing_median_agetotal_roomstotal_bedroomspopulationhouseholdsmedian_incomemedian_house_valueocean_proximity
0-122.2337.8841.0880.0129.0322.0126.08.3252452600.0NEAR BAY
1-122.2237.8621.07099.01106.02401.01138.08.3014358500.0NEAR BAY
2-122.2437.8552.01467.0190.0496.0177.07.2574352100.0NEAR BAY
3-122.2537.8552.01274.0235.0558.0219.05.6431341300.0NEAR BAY
4-122.2537.8552.01627.0280.0565.0259.03.8462342200.0NEAR BAY
\n", "
" ],
      "text/plain": [
       " longitude latitude housing_median_age total_rooms total_bedrooms \\\n",
       "0 -122.23 37.88 41.0 880.0 129.0 \n",
       "1 -122.22 37.86 21.0 7099.0 1106.0 \n",
       "2 -122.24 37.85 52.0 1467.0 190.0 \n",
       "3 -122.25 37.85 52.0 1274.0 235.0 \n",
       "4 -122.25 37.85 52.0 1627.0 280.0 \n",
       "\n",
       " population households median_income median_house_value ocean_proximity \n",
       "0 322.0 126.0 8.3252 452600.0 NEAR BAY \n",
       "1 2401.0 1138.0 8.3014 358500.0 NEAR BAY \n",
       "2 496.0 177.0 7.2574 352100.0 NEAR BAY \n",
       "3 558.0 219.0 5.6431 341300.0 NEAR BAY \n",
       "4 565.0 259.0 3.8462 342200.0 NEAR BAY "
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Load the raw housing table; the path is relative to the working directory.\n",
    "data = pd.read_csv('Data/housing.csv')\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "longitude 0\n",
       "latitude 0\n",
       "housing_median_age 0\n",
       "total_rooms 0\n",
       "total_bedrooms 207\n",
       "population 0\n",
       "households 0\n",
       "median_income 0\n",
       "median_house_value 0\n",
       "ocean_proximity 0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "longitude 0\n",
       "latitude 0\n",
       "housing_median_age 0\n",
       "total_rooms 0\n",
       "total_bedrooms 0\n",
       "population 0\n",
       "households 0\n",
       "median_income 0\n",
       "median_house_value 0\n",
       "ocean_proximity 0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Impute missing total_bedrooms as total_rooms divided by the mean\n",
    "# rooms-per-bedroom ratio computed over the rows where both are present.\n",
    "ratio = (data['total_rooms'] / data['total_bedrooms']).dropna().mean()\n",
    "# Assign the result back to the column: the chained form\n",
    "# data[col].fillna(..., inplace=True) operates on a temporary copy and\n",
    "# no longer updates `data` from pandas 3.0 onwards.\n",
    "data['total_bedrooms'] = data['total_bedrooms'].fillna(data['total_rooms'] / ratio)\n",
    "data.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
longitudelatitudehousing_median_agetotal_roomstotal_bedroomspopulationhouseholdsmedian_incomemedian_house_valueocean_proximity
0-122.2337.880.982119-0.804800-0.971082-0.974405-0.9770098.3252452600.0NEAR BAY
1-122.2237.86-0.6070042.0458411.3506820.8614181.6699218.3014358500.0NEAR BAY
2-122.2437.851.856137-0.535733-0.826120-0.820757-0.8436167.2574352100.0NEAR BAY
3-122.2537.851.856137-0.624199-0.719181-0.766010-0.7337645.6431341300.0NEAR BAY
4-122.2537.851.856137-0.462393-0.612242-0.759828-0.6291423.8462342200.0NEAR BAY
\n", "
" ],
      "text/plain": [
       " longitude latitude housing_median_age total_rooms total_bedrooms \\\n",
       "0 -122.23 37.88 0.982119 -0.804800 -0.971082 \n",
       "1 -122.22 37.86 -0.607004 2.045841 1.350682 \n",
       "2 -122.24 37.85 1.856137 -0.535733 -0.826120 \n",
       "3 -122.25 37.85 1.856137 -0.624199 -0.719181 \n",
       "4 -122.25 37.85 1.856137 -0.462393 -0.612242 \n",
       "\n",
       " population households median_income median_house_value ocean_proximity \n",
       "0 -0.974405 -0.977009 8.3252 452600.0 NEAR BAY \n",
       "1 0.861418 1.669921 8.3014 358500.0 NEAR BAY \n",
       "2 -0.820757 -0.843616 7.2574 352100.0 NEAR BAY \n",
       "3 -0.766010 -0.733764 5.6431 341300.0 NEAR BAY \n",
       "4 -0.759828 -0.629142 3.8462 342200.0 NEAR BAY "
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Standardise the selected columns to zero mean and unit sample standard\n",
    "# deviation (ddof=1). longitude, latitude, median_income and\n",
    "# median_house_value are left on their original scale.\n",
    "scaled_columns = [\n",
    "    'housing_median_age',\n",
    "    'total_rooms',\n",
    "    'total_bedrooms',\n",
    "    'population',\n",
    "    'households',\n",
    "]\n",
    "for column in scaled_columns:\n",
    "    data[column] = (data[column] - data[column].mean()) / data[column].std(ddof=1)\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ocean_proximity\n",
       "<1H OCEAN 9136\n",
       "INLAND 6551\n",
       "NEAR OCEAN 2658\n",
       "NEAR BAY 2290\n",
       "ISLAND 5\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data['ocean_proximity'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Integer code assigned to each ocean_proximity category.\n",
    "mapping = {\n",
    "    '<1H OCEAN': 0,\n",
    "    'INLAND': 1,\n",
    "    'NEAR OCEAN': 2,\n",
    "    'NEAR BAY': 3,\n",
    "    'ISLAND': 4\n",
    "}\n",
    "\n",
    "# Series.map() yields NaN for any value that is not a key of `mapping`, which\n",
    "# includes the already-encoded integers if this cell is executed a second time.\n",
    "# Check before assigning so the column is never silently replaced by NaN.\n",
    "encoded = data['ocean_proximity'].map(mapping)\n",
    "assert encoded.notna().all(), 'ocean_proximity has values missing from mapping'\n",
    "data['ocean_proximity'] = encoded"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
longitudelatitudehousing_median_agetotal_roomstotal_bedroomspopulationhouseholdsmedian_incomemedian_house_valueocean_proximity
0-122.2337.880.982119-0.804800-0.971082-0.974405-0.9770098.3252452600.03
1-122.2237.86-0.6070042.0458411.3506820.8614181.6699218.3014358500.03
2-122.2437.851.856137-0.535733-0.826120-0.820757-0.8436167.2574352100.03
3-122.2537.851.856137-0.624199-0.719181-0.766010-0.7337645.6431341300.03
4-122.2537.851.856137-0.462393-0.612242-0.759828-0.6291423.8462342200.03
\n", "
" ],
      "text/plain": [
       " longitude latitude housing_median_age total_rooms total_bedrooms \\\n",
       "0 -122.23 37.88 0.982119 -0.804800 -0.971082 \n",
       "1 -122.22 37.86 -0.607004 2.045841 1.350682 \n",
       "2 -122.24 37.85 1.856137 -0.535733 -0.826120 \n",
       "3 -122.25 37.85 1.856137 -0.624199 -0.719181 \n",
       "4 -122.25 37.85 1.856137 -0.462393 -0.612242 \n",
       "\n",
       " population households median_income median_house_value ocean_proximity \n",
       "0 -0.974405 -0.977009 8.3252 452600.0 3 \n",
       "1 0.861418 1.669921 8.3014 358500.0 3 \n",
       "2 -0.820757 -0.843616 7.2574 352100.0 3 \n",
       "3 -0.766010 -0.733764 5.6431 341300.0 3 \n",
       "4 -0.759828 -0.629142 3.8462 342200.0 3 "
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.to_csv(\"processed_data.csv\", index=False) # Save without index"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}