| from pathlib import Path |
| from datetime import datetime |
| from typing import Optional |
| import numpy as np |
| import pandas as pd |
|
|
# Relative CSV locations probed in list order by _find_data_file;
# the first one that exists on disk is the one that gets loaded.
DATA_CANDIDATES = [
    "data/uk_real_estate_dataset_with_revenue (1).csv",
    "data/uk_real_estate_dataset_with_revenue.csv",
    "data/uk_real_estate_dataset.csv",
    "uk_real_estate_dataset_with_revenue.csv",
    "uk_real_estate_dataset.csv",
]
|
|
# District names that enrich_data matches against Location_District
# to set the Is_Central_London flag.
CENTRAL_DISTRICTS = {
    "Kensington",
    "Chelsea",
    "Islington",
    "Camden",
    "Hackney",
    "Westminster",
    "Southwark",
    "Lambeth",
    "Hammersmith",
    "Fulham",
    "Tower Hamlets",
    "Brixton",
    "Shoreditch",
}
|
|
def _find_data_file() -> Optional[Path]:
    """Return the first entry of DATA_CANDIDATES that exists on disk.

    Candidates are checked in list order. Returns ``None`` when none of
    them exists.
    """
    existing = (
        candidate
        for candidate in map(Path, DATA_CANDIDATES)
        if candidate.exists()
    )
    return next(existing, None)
|
|
def load_raw_data() -> pd.DataFrame:
    """Load the property dataset, or a built-in five-row sample.

    The CSV is located with ``_find_data_file``. When no candidate file
    exists, a small hard-coded sample frame is used instead.

    Whatever the source, the returned frame carries a ``Year`` column:
    the year of ``Sale_Date`` if that column is present, otherwise of
    ``Listing_Date``, otherwise the current calendar year for every row.
    The chosen date column is parsed with ``errors="coerce"``, so
    unparseable dates become ``NaT`` and their ``Year`` is ``NaN``.

    Returns:
        The raw DataFrame with ``Year`` added.
    """
    path = _find_data_file()
    if path is None:
        now_year = datetime.now().year
        df = pd.DataFrame({
            "Property_ID": list(range(1, 6)),
            "Sale_Price_GBP": [500000, 650000, 825000, 1200000, 430000],
            "Square_Footage": [950, 1200, 1600, 2200, 800],
            "Bedrooms": [2, 3, 3, 4, 2],
            "Bathrooms": [1, 2, 2, 3, 1],
            "Year_Built": [1998, 2005, 2012, 1980, 2018],
            "Quality_Score": [6, 7, 8, 7, 6],
            "Location_City": ["London", "London", "Manchester", "London", "Bristol"],
            "Location_District": ["Islington", "Camden", "Didsbury", "Kensington", "Clifton"],
            "Property_Type": [
                "Townhouse", "Detached House", "Detached House", "Townhouse", "Townhouse",
            ],
            "Sale_Date": pd.date_range(f"{now_year - 1}-01-01", periods=5, freq="90D"),
        })
    else:
        df = pd.read_csv(path)

    # Both sources go through the same Year derivation. Previously the
    # sample frame was returned early without "Year", which made
    # enrich_data fail with a KeyError on it.
    date_col = next(
        (name for name in ("Sale_Date", "Listing_Date") if name in df.columns),
        None,
    )
    if date_col is None:
        df["Year"] = datetime.now().year
    else:
        df[date_col] = pd.to_datetime(df[date_col], errors="coerce")
        df["Year"] = df[date_col].dt.year

    return df
|
|
def enrich_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with derived pricing, size and location features.

    Missing input columns are created with defaults before the features
    are computed. ``Year`` is derived from ``Sale_Date`` or
    ``Listing_Date`` when absent, falling back to the current year.

    Added columns:
        Price_Per_Sqft: sale price / square footage (zero footage is
            treated as missing); gaps are filled with the column median.
        Property_Age: ``Year - Year_Built``, floored at 0.
        Total_Rooms: bedrooms + bathrooms, with 0 replaced by 1.
        Size_Per_Room: square footage / Total_Rooms.
        Is_London, Is_Central_London, Is_Detached, Is_Townhouse: 0/1 flags.

    Args:
        df: Raw property data; it is not modified.

    Returns:
        A new DataFrame containing the original columns plus the above.
    """
    df = df.copy()
    now_year = datetime.now().year

    column_defaults = {
        "Square_Footage": np.nan,
        "Bedrooms": 0,
        "Bathrooms": 0,
        "Year_Built": now_year,
        "Quality_Score": 6,
        "Location_City": "London",
        "Location_District": "Westminster",
        "Property_Type": "Townhouse",
        "Sale_Price_GBP": np.nan,
    }
    for col, default in column_defaults.items():
        if col not in df.columns:
            df[col] = default

    # "Year" is required for Property_Age below. Derive it when the
    # caller did not supply it instead of failing with a KeyError.
    if "Year" not in df.columns:
        date_col = next(
            (name for name in ("Sale_Date", "Listing_Date") if name in df.columns),
            None,
        )
        if date_col is None:
            df["Year"] = now_year
        else:
            df["Year"] = pd.to_datetime(df[date_col], errors="coerce").dt.year

    # Zero square footage becomes NaN so the division never yields inf.
    df["Price_Per_Sqft"] = df["Sale_Price_GBP"] / df["Square_Footage"].replace(0, np.nan)
    df["Price_Per_Sqft"] = df["Price_Per_Sqft"].fillna(df["Price_Per_Sqft"].median())

    df["Property_Age"] = (df["Year"] - df["Year_Built"]).clip(lower=0)
    # A room count of 0 (or missing) is treated as 1 to keep Size_Per_Room finite.
    df["Total_Rooms"] = (df["Bedrooms"] + df["Bathrooms"]).replace(0, np.nan).fillna(1)
    df["Size_Per_Room"] = df["Square_Footage"] / df["Total_Rooms"]

    df["Is_London"] = (df["Location_City"].astype(str) == "London").astype(int)
    df["Is_Central_London"] = df["Location_District"].isin(CENTRAL_DISTRICTS).astype(int)

    df["Is_Detached"] = (df["Property_Type"] == "Detached House").astype(int)
    df["Is_Townhouse"] = (df["Property_Type"] == "Townhouse").astype(int)

    return df
|
|