File size: 3,442 Bytes
3512a86
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
from pathlib import Path
from datetime import datetime
from typing import Optional
import numpy as np
import pandas as pd

# Relative paths that may hold the dataset. The order matters: the lookup
# helper walks this list front to back and stops at the first existing path.
DATA_CANDIDATES = [
    "data/uk_real_estate_dataset_with_revenue (1).csv",
    "data/uk_real_estate_dataset_with_revenue.csv",
    "data/uk_real_estate_dataset.csv",
    "uk_real_estate_dataset_with_revenue (1).csv",
    "uk_real_estate_dataset.csv",
]

# District names used to set the ``Is_Central_London`` flag during enrichment.
CENTRAL_DISTRICTS = {
    "Brixton",
    "Camden",
    "Chelsea",
    "Fulham",
    "Hackney",
    "Hammersmith",
    "Islington",
    "Kensington",
    "Lambeth",
    "Shoreditch",
    "Southwark",
    "Tower Hamlets",
    "Westminster",
}

def _find_data_file() -> Optional[Path]:
    """Return the first entry of ``DATA_CANDIDATES`` that exists on disk.

    Candidates are checked in list order. ``None`` is returned when none of
    them exists.
    """
    existing = (
        candidate
        for candidate in map(Path, DATA_CANDIDATES)
        if candidate.exists()
    )
    # The generator is lazy, so probing stops at the first hit.
    return next(existing, None)

def load_raw_data() -> pd.DataFrame:
    """Load the property dataset, or a small built-in sample if no file exists.

    The CSV is located with ``_find_data_file()``. When no candidate file is
    found, a five-row sample frame is built instead so callers still receive
    data with the expected columns.

    In both cases a ``Year`` column is attached:

    * if ``Sale_Date`` is present it is parsed as datetimes (unparseable
      values become ``NaT``) and ``Year`` is its year;
    * otherwise, if ``Listing_Date`` is present, it is handled the same way;
    * otherwise ``Year`` is the current calendar year for every row.

    Returns:
        The loaded (or sample) frame, always containing a ``Year`` column.
    """
    path = _find_data_file()
    if path is None:
        now_year = datetime.now().year
        df = pd.DataFrame({
            "Property_ID": list(range(1, 6)),
            "Sale_Price_GBP": [500000, 650000, 825000, 1200000, 430000],
            "Square_Footage": [950, 1200, 1600, 2200, 800],
            "Bedrooms": [2, 3, 3, 4, 2],
            "Bathrooms": [1, 2, 2, 3, 1],
            "Year_Built": [1998, 2005, 2012, 1980, 2018],
            "Quality_Score": [6, 7, 8, 7, 6],
            "Location_City": ["London", "London", "Manchester", "London", "Bristol"],
            "Location_District": ["Islington", "Camden", "Didsbury", "Kensington", "Clifton"],
            "Property_Type": ["Townhouse", "Detached House", "Detached House", "Townhouse", "Townhouse"],
            "Sale_Date": pd.date_range(f"{now_year - 1}-01-01", periods=5, freq="90D"),
        })
    else:
        df = pd.read_csv(path)

    # The sample frame previously returned early without "Year", unlike the
    # CSV path. Deriving it here for both sources keeps the schema identical.
    date_col = next(
        (col for col in ("Sale_Date", "Listing_Date") if col in df.columns),
        None,
    )
    if date_col is None:
        df["Year"] = datetime.now().year
    else:
        # errors="coerce" turns unparseable dates into NaT instead of raising.
        df[date_col] = pd.to_datetime(df[date_col], errors="coerce")
        df["Year"] = df[date_col].dt.year

    return df

def enrich_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with derived feature columns added.

    Missing base columns are first filled with defaults so the derived
    columns can always be computed. A missing ``Year`` column is derived from
    ``Sale_Date`` (or ``Listing_Date``) when available, and falls back to the
    current year otherwise.

    Columns added:
        Price_Per_Sqft: ``Sale_Price_GBP / Square_Footage``; a zero square
            footage is treated as missing and gaps are filled with the median.
        Property_Age: ``Year - Year_Built``, floored at 0.
        Total_Rooms: ``Bedrooms + Bathrooms``; zero or missing becomes 1.
        Size_Per_Room: ``Square_Footage / Total_Rooms``.
        Is_London, Is_Central_London, Is_Detached, Is_Townhouse: 0/1 flags.

    Args:
        df: Input frame; it is not modified.

    Returns:
        A new frame with the defaulted and derived columns.
    """
    df = df.copy()
    now_year = datetime.now().year

    column_defaults = [
        ("Square_Footage", np.nan),
        ("Bedrooms", 0),
        ("Bathrooms", 0),
        ("Year_Built", now_year),
        ("Quality_Score", 6),
        ("Location_City", "London"),
        ("Location_District", "Westminster"),
        ("Property_Type", "Townhouse"),
        ("Sale_Price_GBP", np.nan),
    ]
    for col, default in column_defaults:
        if col not in df.columns:
            df[col] = default

    # "Year" is read below but was never defaulted, so a frame without it
    # raised KeyError. Derive it from a date column when one is available.
    if "Year" not in df.columns:
        date_col = next(
            (col for col in ("Sale_Date", "Listing_Date") if col in df.columns),
            None,
        )
        if date_col is None:
            df["Year"] = now_year
        else:
            df["Year"] = pd.to_datetime(df[date_col], errors="coerce").dt.year

    # Replace a zero footage with NaN so the division yields NaN, not inf.
    df["Price_Per_Sqft"] = df["Sale_Price_GBP"] / df["Square_Footage"].replace(0, np.nan)
    df["Price_Per_Sqft"] = df["Price_Per_Sqft"].fillna(df["Price_Per_Sqft"].median())

    # Clip so a build year later than the sale/listing year gives 0, not a negative age.
    df["Property_Age"] = (df["Year"] - df["Year_Built"]).clip(lower=0)
    # A room count of zero (or missing) becomes 1 to keep the next division defined.
    df["Total_Rooms"] = (df["Bedrooms"] + df["Bathrooms"]).replace(0, np.nan).fillna(1)
    df["Size_Per_Room"] = df["Square_Footage"] / df["Total_Rooms"]

    df["Is_London"] = (df["Location_City"].astype(str) == "London").astype(int)
    df["Is_Central_London"] = df["Location_District"].isin(CENTRAL_DISTRICTS).astype(int)

    df["Is_Detached"] = (df["Property_Type"] == "Detached House").astype(int)
    df["Is_Townhouse"] = (df["Property_Type"] == "Townhouse").astype(int)

    return df