# uk_housing / data_utils.py
# seanerons's picture
# Upload 7 files
# 3512a86 verified
from pathlib import Path
from datetime import datetime
from typing import Optional
import numpy as np
import pandas as pd
# Relative paths probed, in priority order, when looking for the dataset CSV.
# Order matters: the first path that exists is the one that gets loaded.
DATA_CANDIDATES = [
    # Copies stored under the "data/" sub-directory.
    "data/uk_real_estate_dataset_with_revenue (1).csv",
    "data/uk_real_estate_dataset_with_revenue.csv",
    "data/uk_real_estate_dataset.csv",
    # Copies stored directly in the working directory.
    "uk_real_estate_dataset_with_revenue (1).csv",
    "uk_real_estate_dataset.csv",
]
# District names that are flagged as "central London" during enrichment.
# Membership is tested by exact string match against Location_District.
CENTRAL_DISTRICTS = {
    "Kensington",
    "Chelsea",
    "Islington",
    "Camden",
    "Hackney",
    "Westminster",
    "Southwark",
    "Lambeth",
    "Hammersmith",
    "Fulham",
    "Tower Hamlets",
    "Brixton",
    "Shoreditch",
}
def _find_data_file() -> Optional[Path]:
    """Return the first path from ``DATA_CANDIDATES`` that exists.

    Candidates are checked in list order. ``None`` is returned when none of
    them exists on disk.
    """
    candidate_paths = (Path(candidate) for candidate in DATA_CANDIDATES)
    # next() stops at the first hit, so later candidates are never probed.
    return next((found for found in candidate_paths if found.exists()), None)
def load_raw_data() -> pd.DataFrame:
    """Load the property dataset, or a small synthetic sample if none exists.

    The CSV located by ``_find_data_file`` is read when available; otherwise a
    five-row in-memory sample is built so callers still receive a frame with
    the expected columns.

    In both cases the first of ``Sale_Date`` / ``Listing_Date`` that is present
    is parsed to datetimes (unparseable values become ``NaT``) and its year is
    stored in a ``Year`` column. When neither column exists, ``Year`` is set
    to the current year.

    Returns:
        The loaded (or synthetic) DataFrame, always including a ``Year`` column.
    """
    path = _find_data_file()
    if path is None:
        now_year = datetime.now().year
        df = pd.DataFrame({
            "Property_ID": list(range(1, 6)),
            "Sale_Price_GBP": [500000, 650000, 825000, 1200000, 430000],
            "Square_Footage": [950, 1200, 1600, 2200, 800],
            "Bedrooms": [2, 3, 3, 4, 2],
            "Bathrooms": [1, 2, 2, 3, 1],
            "Year_Built": [1998, 2005, 2012, 1980, 2018],
            "Quality_Score": [6, 7, 8, 7, 6],
            "Location_City": ["London", "London", "Manchester", "London", "Bristol"],
            "Location_District": ["Islington", "Camden", "Didsbury", "Kensington", "Clifton"],
            "Property_Type": ["Townhouse", "Detached House", "Detached House", "Townhouse", "Townhouse"],
            # Sample sales start on 1 January of the previous year, 90 days apart.
            "Sale_Date": pd.date_range(str(now_year - 1) + "-01-01", periods=5, freq="90D"),
        })
    else:
        df = pd.read_csv(path)

    # Derive "Year" on BOTH paths. Previously the synthetic sample was returned
    # without it, which made enrich_data() fail on the missing column.
    for date_col in ("Sale_Date", "Listing_Date"):
        if date_col in df.columns:
            df[date_col] = pd.to_datetime(df[date_col], errors="coerce")
            df["Year"] = df[date_col].dt.year
            # Only the first available date column is parsed and used.
            break
    else:
        # No date information at all: fall back to the current year.
        df["Year"] = datetime.now().year
    return df
def enrich_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with derived price, size and location features.

    Any input column the computations rely on is created with a default value
    when it is absent, so partially populated frames can still be enriched.
    The input frame itself is not modified.

    Added columns:
        Price_Per_Sqft: ``Sale_Price_GBP / Square_Footage`` (zero footage is
            treated as missing); missing results are filled with the median.
        Property_Age: ``Year - Year_Built``, floored at 0.
        Total_Rooms: ``Bedrooms + Bathrooms``; zero or missing totals become 1.
        Size_Per_Room: ``Square_Footage / Total_Rooms``.
        Is_London, Is_Central_London, Is_Detached, Is_Townhouse: 0/1 flags.

    Args:
        df: Property records, e.g. as returned by ``load_raw_data``.

    Returns:
        A new DataFrame containing the original columns plus the features above.
    """
    df = df.copy()
    now_year = datetime.now().year

    column_defaults = [
        ("Square_Footage", np.nan),
        ("Bedrooms", 0),
        ("Bathrooms", 0),
        ("Year_Built", now_year),
        ("Quality_Score", 6),
        ("Location_City", "London"),
        ("Location_District", "Westminster"),
        ("Property_Type", "Townhouse"),
        ("Sale_Price_GBP", np.nan),
        # "Year" is read below for Property_Age; without a default, a frame
        # lacking it (such as the synthetic sample) raised KeyError.
        ("Year", now_year),
    ]
    for col, default in column_defaults:
        if col not in df.columns:
            df[col] = default

    # Zero square footage is mapped to NaN so the division yields NaN, not inf.
    df["Price_Per_Sqft"] = df["Sale_Price_GBP"] / df["Square_Footage"].replace(0, np.nan)
    df["Price_Per_Sqft"] = df["Price_Per_Sqft"].fillna(df["Price_Per_Sqft"].median())

    # Negative ages (Year_Built later than Year) are clipped to 0.
    df["Property_Age"] = (df["Year"] - df["Year_Built"]).clip(lower=0)

    # A room count of 0 (or missing) becomes 1 to keep the next division finite.
    df["Total_Rooms"] = (df["Bedrooms"] + df["Bathrooms"]).replace(0, np.nan).fillna(1)
    df["Size_Per_Room"] = df["Square_Footage"] / df["Total_Rooms"]

    df["Is_London"] = (df["Location_City"].astype(str) == "London").astype(int)
    df["Is_Central_London"] = df["Location_District"].isin(CENTRAL_DISTRICTS).astype(int)
    df["Is_Detached"] = (df["Property_Type"] == "Detached House").astype(int)
    df["Is_Townhouse"] = (df["Property_Type"] == "Townhouse").astype(int)
    return df