Spaces:
No application file
No application file
| from pathlib import Path | |
| from datetime import datetime | |
| from typing import Optional | |
| import numpy as np | |
| import pandas as pd | |
# Candidate CSV locations, listed in the order in which they are tried.
DATA_CANDIDATES = [
    "data/uk_real_estate_dataset_with_revenue (1).csv",
    "data/uk_real_estate_dataset_with_revenue.csv",
    "data/uk_real_estate_dataset.csv",
    "uk_real_estate_dataset_with_revenue (1).csv",
    "uk_real_estate_dataset.csv",
]

# District names used to set the Is_Central_London flag.
CENTRAL_DISTRICTS = {
    "Kensington",
    "Chelsea",
    "Islington",
    "Camden",
    "Hackney",
    "Westminster",
    "Southwark",
    "Lambeth",
    "Hammersmith",
    "Fulham",
    "Tower Hamlets",
    "Brixton",
    "Shoreditch",
}
def _find_data_file() -> Optional[Path]:
    """Return the first entry of ``DATA_CANDIDATES`` that is an existing file.

    Candidates are checked in list order; relative entries are resolved
    against the current working directory.

    Returns:
        The matching ``Path``, or ``None`` when no candidate is a file.
    """
    for candidate in DATA_CANDIDATES:
        path = Path(candidate)
        # is_file() rather than exists(): a directory with a matching name
        # must not be returned, since the caller passes the result to
        # pd.read_csv.
        if path.is_file():
            return path
    return None
def load_raw_data() -> pd.DataFrame:
    """Load the property dataset, or a small built-in sample if no file exists.

    The CSV is located via ``_find_data_file()``. When none is found, a
    five-row sample frame is built instead. In both cases the frame gets a
    ``Year`` column: the year of ``Sale_Date`` if that column is present,
    otherwise of ``Listing_Date``, otherwise the current calendar year for
    every row. The date column used is converted to datetime in place, with
    unparseable values coerced to ``NaT`` (their ``Year`` is then missing).

    Returns:
        The loaded (or sample) DataFrame, always including a ``Year`` column.
    """
    path = _find_data_file()
    if path is None:
        now_year = datetime.now().year
        df = pd.DataFrame({
            "Property_ID": list(range(1, 6)),
            "Sale_Price_GBP": [500000, 650000, 825000, 1200000, 430000],
            "Square_Footage": [950, 1200, 1600, 2200, 800],
            "Bedrooms": [2, 3, 3, 4, 2],
            "Bathrooms": [1, 2, 2, 3, 1],
            "Year_Built": [1998, 2005, 2012, 1980, 2018],
            "Quality_Score": [6, 7, 8, 7, 6],
            "Location_City": ["London", "London", "Manchester", "London", "Bristol"],
            "Location_District": ["Islington", "Camden", "Didsbury", "Kensington", "Clifton"],
            "Property_Type": ["Townhouse", "Detached House", "Detached House", "Townhouse", "Townhouse"],
            "Sale_Date": pd.date_range(f"{now_year - 1}-01-01", periods=5, freq="90D"),
        })
    else:
        df = pd.read_csv(path)

    # Derive Year on BOTH paths. Previously the sample frame was returned
    # without it, so downstream code reading df["Year"] raised KeyError.
    date_col = next(
        (col for col in ("Sale_Date", "Listing_Date") if col in df.columns),
        None,
    )
    if date_col is None:
        df["Year"] = datetime.now().year
    else:
        df[date_col] = pd.to_datetime(df[date_col], errors="coerce")
        df["Year"] = df[date_col].dt.year
    return df
def enrich_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* extended with derived feature columns.

    Missing input columns are first backfilled with scalar defaults. A
    missing ``Year`` column is derived from ``Sale_Date`` (or
    ``Listing_Date``) when available, else set to the current year.

    Added columns:
        Price_Per_Sqft: ``Sale_Price_GBP / Square_Footage`` (zero footage is
            treated as missing); gaps are filled with the column median.
        Property_Age: ``Year - Year_Built``, floored at 0.
        Total_Rooms: ``Bedrooms + Bathrooms``; zero or missing becomes 1.
        Size_Per_Room: ``Square_Footage / Total_Rooms``.
        Is_London, Is_Central_London, Is_Detached, Is_Townhouse: 0/1 flags.

    Args:
        df: Input frame; it is not modified.

    Returns:
        A new DataFrame with the defaults and derived columns applied.
    """
    df = df.copy()
    now_year = datetime.now().year

    # Backfill absent inputs so every derived feature below can be computed.
    column_defaults = {
        "Square_Footage": np.nan,
        "Bedrooms": 0,
        "Bathrooms": 0,
        "Year_Built": now_year,
        "Quality_Score": 6,
        "Location_City": "London",
        "Location_District": "Westminster",
        "Property_Type": "Townhouse",
        "Sale_Price_GBP": np.nan,
    }
    for col, default in column_defaults.items():
        if col not in df.columns:
            df[col] = default

    # "Year" was previously assumed to exist, so a frame without it raised
    # KeyError. Derive it the same way the loader does.
    if "Year" not in df.columns:
        date_col = next(
            (col for col in ("Sale_Date", "Listing_Date") if col in df.columns),
            None,
        )
        if date_col is None:
            df["Year"] = now_year
        else:
            df["Year"] = pd.to_datetime(df[date_col], errors="coerce").dt.year

    # Zero footage becomes NaN so the division cannot produce inf.
    price_per_sqft = df["Sale_Price_GBP"] / df["Square_Footage"].replace(0, np.nan)
    df["Price_Per_Sqft"] = price_per_sqft.fillna(price_per_sqft.median())

    df["Property_Age"] = (df["Year"] - df["Year_Built"]).clip(lower=0)

    # A room count of zero (or missing) becomes 1 to keep Size_Per_Room finite.
    df["Total_Rooms"] = (df["Bedrooms"] + df["Bathrooms"]).replace(0, np.nan).fillna(1)
    df["Size_Per_Room"] = df["Square_Footage"] / df["Total_Rooms"]

    df["Is_London"] = (df["Location_City"].astype(str) == "London").astype(int)
    df["Is_Central_London"] = df["Location_District"].isin(CENTRAL_DISTRICTS).astype(int)
    df["Is_Detached"] = (df["Property_Type"] == "Detached House").astype(int)
    df["Is_Townhouse"] = (df["Property_Type"] == "Townhouse").astype(int)
    return df