"""Load the UK real-estate dataset and derive model features from it."""

from pathlib import Path
from datetime import datetime
from typing import Optional

import numpy as np
import pandas as pd

# Candidate CSV locations, probed in order; the first existing path wins.
DATA_CANDIDATES = [
    "data/uk_real_estate_dataset_with_revenue (1).csv",
    "data/uk_real_estate_dataset_with_revenue.csv",
    "data/uk_real_estate_dataset.csv",
    "uk_real_estate_dataset_with_revenue (1).csv",
    "uk_real_estate_dataset.csv",
]

# District names that set the ``Is_Central_London`` flag in ``enrich_data``.
CENTRAL_DISTRICTS = {
    "Kensington", "Chelsea", "Islington", "Camden", "Hackney", "Westminster",
    "Southwark", "Lambeth", "Hammersmith", "Fulham", "Tower Hamlets",
    "Brixton", "Shoreditch",
}


def _find_data_file() -> Optional[Path]:
    """Return the first path in ``DATA_CANDIDATES`` that exists, else ``None``."""
    for candidate in DATA_CANDIDATES:
        path = Path(candidate)
        if path.exists():
            return path
    return None


def _sample_data() -> pd.DataFrame:
    """Build the five-row fallback frame used when no CSV file is found."""
    now_year = datetime.now().year
    return pd.DataFrame({
        "Property_ID": list(range(1, 6)),
        "Sale_Price_GBP": [500000, 650000, 825000, 1200000, 430000],
        "Square_Footage": [950, 1200, 1600, 2200, 800],
        "Bedrooms": [2, 3, 3, 4, 2],
        "Bathrooms": [1, 2, 2, 3, 1],
        "Year_Built": [1998, 2005, 2012, 1980, 2018],
        "Quality_Score": [6, 7, 8, 7, 6],
        "Location_City": ["London", "London", "Manchester", "London", "Bristol"],
        "Location_District": ["Islington", "Camden", "Didsbury", "Kensington", "Clifton"],
        "Property_Type": [
            "Townhouse", "Detached House", "Detached House", "Townhouse", "Townhouse",
        ],
        "Sale_Date": pd.date_range(f"{now_year - 1}-01-01", periods=5, freq="90D"),
    })


def _add_year_column(df: pd.DataFrame) -> pd.DataFrame:
    """Parse the available date column and set ``Year`` on *df* in place.

    ``Sale_Date`` takes priority over ``Listing_Date``. The chosen column is
    converted with ``pd.to_datetime(errors="coerce")``, so unparseable values
    become ``NaT`` (and their ``Year`` becomes NaN). When neither column is
    present, ``Year`` is set to the current calendar year for every row.
    """
    for date_col in ("Sale_Date", "Listing_Date"):
        if date_col in df.columns:
            df[date_col] = pd.to_datetime(df[date_col], errors="coerce")
            df["Year"] = df[date_col].dt.year
            return df
    df["Year"] = datetime.now().year
    return df


def load_raw_data() -> pd.DataFrame:
    """Load the raw dataset, falling back to a small built-in sample.

    Returns:
        The first CSV found via ``DATA_CANDIDATES`` (or the sample frame when
        none exists), with its date column parsed and a ``Year`` column added.
    """
    path = _find_data_file()
    df = _sample_data() if path is None else pd.read_csv(path)
    # The sample frame goes through the same Year derivation as a CSV so that
    # downstream code (e.g. ``enrich_data``) can always rely on ``Year``.
    return _add_year_column(df)


def enrich_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with default columns and derived features added.

    Missing input columns are filled with fixed defaults. Added columns:
    ``Price_Per_Sqft``, ``Property_Age``, ``Total_Rooms``, ``Size_Per_Room``,
    ``Is_London``, ``Is_Central_London``, ``Is_Detached``, ``Is_Townhouse``.
    The input frame is not modified.
    """
    df = df.copy()
    now_year = datetime.now().year

    column_defaults = [
        ("Square_Footage", np.nan),
        ("Bedrooms", 0),
        ("Bathrooms", 0),
        ("Year_Built", now_year),
        ("Quality_Score", 6),
        ("Location_City", "London"),
        ("Location_District", "Westminster"),
        ("Property_Type", "Townhouse"),
        ("Sale_Price_GBP", np.nan),
    ]
    for col, default in column_defaults:
        if col not in df.columns:
            df[col] = default

    # ``Property_Age`` needs ``Year``; derive it when the caller's frame did
    # not come through ``load_raw_data`` instead of failing with a KeyError.
    if "Year" not in df.columns:
        _add_year_column(df)

    # A zero area would divide to inf; treat it as missing, then fill every
    # missing ratio with the median of the ratios that could be computed.
    df["Price_Per_Sqft"] = df["Sale_Price_GBP"] / df["Square_Footage"].replace(0, np.nan)
    df["Price_Per_Sqft"] = df["Price_Per_Sqft"].fillna(df["Price_Per_Sqft"].median())

    # Negative ages (built after the reference year) are clamped to zero.
    df["Property_Age"] = (df["Year"] - df["Year_Built"]).clip(lower=0)

    # A room count of zero is replaced by one to keep the division below finite.
    df["Total_Rooms"] = (df["Bedrooms"] + df["Bathrooms"]).replace(0, np.nan).fillna(1)
    df["Size_Per_Room"] = df["Square_Footage"] / df["Total_Rooms"]

    df["Is_London"] = (df["Location_City"].astype(str) == "London").astype(int)
    df["Is_Central_London"] = df["Location_District"].isin(CENTRAL_DISTRICTS).astype(int)
    df["Is_Detached"] = (df["Property_Type"] == "Detached House").astype(int)
    df["Is_Townhouse"] = (df["Property_Type"] == "Townhouse").astype(int)
    return df