# File size: 6,318 Bytes
# ffdb9be
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
# --- Imports ---
import numpy as np
import geopandas as gpd
from shapely.geometry import Point
from geopy.distance import geodesic
from sklearn.neighbors import BallTree
import pandas as pd

# --- Constants ---
# Reference point for the distance/angle features, stored as (latitude, longitude)
# in decimal degrees: index 0 is read as latitude and index 1 as longitude.
CITY_CENTER = (51.5072, -0.1276)  # London
# CRS identifier (WGS 84 longitude/latitude) that every loaded layer is
# reprojected to and that query points are created in.
EPSG = "EPSG:4326"


class LondonPropertyGeoFeatures:
    """Extract London property geo features for model inference.

    On construction the instance loads four layers from ``geo_dir``
    (borough boundaries, noise hexagons, fare zones and stations), reprojects
    them to ``EPSG`` and builds a BallTree over the station coordinates so
    nearest-station lookups are fast.

    All public lookup methods take ``lat``/``lon`` in decimal degrees.
    """

    # Divisor used to turn metres into miles. The exact value is 1609.344;
    # the original approximation is kept so produced feature values do not shift.
    _METERS_PER_MILE = 1609.34
    # Number of nearest-station slots emitted by extract_geo_features().
    _STATION_SLOTS = 3

    def __init__(self, geo_dir):
        """Load every dataset found under ``geo_dir`` and index the stations.

        Args:
            geo_dir: Directory (or path prefix) that contains
                ``london_boroughs.geojson``, ``noize.parquet``,
                ``zone_fares.parquet`` and ``rail_tfl.parquet``.
        """
        self.CITY_CENTER = CITY_CENTER
        self.EPSG = EPSG
        self.geo_dir = geo_dir
        self.load_datasets()
        self.prepare_station_tree()

    def load_datasets(self):
        """Load the geographic layers and reproject them to ``self.EPSG``."""
        self.london_boundaries = gpd.read_file(f"{self.geo_dir}/london_boroughs.geojson").to_crs(self.EPSG)
        self.hex_gdf = gpd.read_parquet(f"{self.geo_dir}/noize.parquet").to_crs(self.EPSG)
        self.zone_fares = gpd.read_parquet(f"{self.geo_dir}/zone_fares.parquet").to_crs(self.EPSG)
        self.stations = gpd.read_parquet(f"{self.geo_dir}/rail_tfl.parquet").to_crs(self.EPSG)

    def prepare_station_tree(self):
        """Prepare BallTree for fast station distance queries."""
        # Project to the estimated UTM zone so that plain Euclidean distances
        # between coordinates are planar distances in the CRS units (metres).
        self.stations_utm = self.stations.to_crs(self.stations.estimate_utm_crs())
        station_coords = np.array([[p.x, p.y] for p in self.stations_utm.geometry])
        self.station_tree = BallTree(station_coords, leaf_size=15, metric='euclidean')
        # Arrays below are positionally aligned with the rows fed to the tree,
        # so a tree index can be used directly to look up station attributes.
        self.station_names = self.stations_utm['CommonName'].values
        self.station_tfl = self.stations_utm['TFL'].values
        self.station_rail = self.stations_utm['RAIL'].values

    def _create_point_gdf(self, lat, lon):
        """Create a single-row GeoDataFrame for the point (internal helper)."""
        # shapely points are (x, y), i.e. (longitude, latitude).
        point = Point(lon, lat)
        return gpd.GeoDataFrame(geometry=[point], crs=self.EPSG)

    def _polygon_attribute(self, lat, lon, layer, column):
        """Return ``column`` of the first ``layer`` polygon containing the point.

        Uses a left spatial join, so a point that falls in no polygon still
        yields one row whose right-hand columns are missing (NaN). ``None`` is
        returned only when ``column`` does not exist in the joined result.
        """
        point_gdf = self._create_point_gdf(lat, lon)
        joined = gpd.sjoin(point_gdf, layer, how="left", predicate="within")
        return joined.iloc[0].get(column, None)

    def borough(self, lat, lon):
        """Return the London borough name containing the given coordinates."""
        return self._polygon_attribute(lat, lon, self.london_boundaries, "name")

    def compute_angle(self, lat, lon):
        """Compute angle (in radians) of a point relative to London center.

        The value is ``arctan2(east_component, north_component)``: 0 for a
        point due north of the centre, positive towards the east, in the range
        [-pi, pi].
        """
        lat1, lon1 = np.radians(self.CITY_CENTER[0]), np.radians(self.CITY_CENTER[1])
        lat2, lon2 = np.radians(lat), np.radians(lon)

        dlon = lon2 - lon1
        x = np.cos(lat2) * np.sin(dlon)
        y = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(dlon)
        return np.arctan2(x, y)

    def distance_to_center(self, lat, lon):
        """Return distance from city center (in miles)."""
        return geodesic((lat, lon), self.CITY_CENTER).miles

    def noize_class(self, lat, lon):
        """Return noise class for given coordinates."""
        return self._polygon_attribute(lat, lon, self.hex_gdf, "NoiseClass")

    @staticmethod
    def _zone_number(zone_name):
        """Strip a ``"Zone X"`` label down to ``"X"``; pass anything else through.

        Non-string values (``None``, or the NaN produced when a point lies
        outside every zone polygon) are returned unchanged instead of being
        searched for a substring, which would raise ``TypeError``.
        """
        if isinstance(zone_name, str) and "Zone" in zone_name:
            return zone_name.split(" ")[-1]
        return zone_name

    def zone_fare(self, lat, lon):
        """Return transport fare zone for given coordinates.

        Returns the trailing token of a ``"Zone X"`` label, or the raw joined
        value (possibly NaN/None) when there is no such label.
        """
        zone_name = self._polygon_attribute(lat, lon, self.zone_fares, "Name")
        return self._zone_number(zone_name)

    def find_nearest_stations(self, lat, lon, k=3, max_distance_meters=50000):
        """Find the k nearest stations with distances and TFL/RAIL flags.

        Args:
            lat: Latitude of the query point.
            lon: Longitude of the query point.
            k: Maximum number of stations to return. Values larger than the
                number of indexed stations are clamped; values below 1 give
                an empty list.
            max_distance_meters: Stations farther away than this are dropped.

        Returns:
            A list (nearest first) of dicts with keys ``distance_miles``,
            ``name``, ``TFL`` and ``RAIL``.
        """
        # BallTree.query raises if asked for more neighbours than it holds.
        k = min(k, len(self.station_names))
        if k < 1:
            return []

        prop_gdf = self._create_point_gdf(lat, lon)
        prop_utm = prop_gdf.to_crs(self.stations_utm.crs)

        # Query the BallTree with the single projected point.
        prop_coords = np.array([[p.x, p.y] for p in prop_utm.geometry])
        distances_m, indices = self.station_tree.query(prop_coords, k=k)

        return [
            {
                'distance_miles': dist_m / self._METERS_PER_MILE,
                'name': self.station_names[idx],
                'TFL': bool(self.station_tfl[idx]),
                'RAIL': bool(self.station_rail[idx]),
            }
            for dist_m, idx in zip(distances_m[0], indices[0])
            if dist_m <= max_distance_meters
        ]

    @classmethod
    def _station_features(cls, nearest_stations):
        """Flatten station dicts into numbered ``distance_to_station{i}``/``TFL{i}``/``RAIL{i}`` keys.

        Always emits ``_STATION_SLOTS`` slots; slots with no station get a
        ``None`` distance and ``False`` flags.
        """
        station_features = {}
        for i, station in enumerate(nearest_stations[:cls._STATION_SLOTS], 1):
            station_features[f'distance_to_station{i}'] = round(station['distance_miles'], 6)
            station_features[f'TFL{i}'] = station['TFL']
            station_features[f'RAIL{i}'] = station['RAIL']

        # Fill missing stations with default values
        for i in range(len(nearest_stations) + 1, cls._STATION_SLOTS + 1):
            station_features[f'distance_to_station{i}'] = None
            station_features[f'TFL{i}'] = False
            station_features[f'RAIL{i}'] = False
        return station_features

    @classmethod
    def _feature_names(cls):
        """Return the keys of extract_geo_features() in output order.

        Must be kept in sync with the dict built in extract_geo_features().
        """
        return [
            "distance_to_center",
            "angle_from_center",
            "zone",
            "borough",
            "NoiseClass",
            *cls._station_features([]),
        ]

    def extract_geo_features(self, lat, lon):
        """Extract all GEO features for model inference in the required format.

        Returns:
            A dict with ``distance_to_center`` (miles), ``angle_from_center``
            (radians), ``zone``, ``borough``, ``NoiseClass`` and, for each of
            the three nearest stations, ``distance_to_station{i}`` (miles),
            ``TFL{i}`` and ``RAIL{i}``.
        """
        # Geographic features
        borough_name = self.borough(lat, lon)
        angle = self.compute_angle(lat, lon)
        center_distance = self.distance_to_center(lat, lon)
        noise_class = self.noize_class(lat, lon)
        zone = self.zone_fare(lat, lon)

        # Station features
        nearest_stations = self.find_nearest_stations(lat, lon, k=self._STATION_SLOTS)
        station_features = self._station_features(nearest_stations)

        return {
            "distance_to_center": round(center_distance, 6),
            "angle_from_center": round(angle, 6),
            "zone": zone,
            "borough": borough_name,
            "NoiseClass": noise_class,
            **station_features,
        }

    def add_features_to_df(self, df: pd.DataFrame) -> pd.DataFrame:
        """Append geo features to every row of ``df``.

        Features are computed row by row (not vectorized) from the
        ``latitude`` and ``longitude`` columns and concatenated column-wise to
        a new frame; ``df`` itself is not modified.

        For an empty ``df`` the feature columns are still added (with no
        rows). A plain ``apply`` would instead hand back a copy of the input
        and the concat would duplicate the input columns.
        """
        if df.empty:
            features = pd.DataFrame(columns=self._feature_names(), index=df.index)
            return pd.concat([df, features], axis=1)

        features = df.apply(
            lambda row: pd.Series(self.extract_geo_features(row["latitude"], row["longitude"])),
            axis=1
        )
        return pd.concat([df, features], axis=1)