import pandas as pd
from dataclasses import dataclass
from pathlib import Path

from src.features.construction_age_band_sap import (
    normalize_construction_age_band,
    windows_feature_engineering_vectorised,
)
from src.features.energy_system import energy_system_feature_engineering_vectorised
from src.features.floor import floor_feature_engineering_fast
from src.features.walls import wall_feature_engineering
from src.features.roofs import roof_feature_engineering


def build_age_band_lookup(series: pd.Series) -> dict:
    """Build a lookup of raw CONSTRUCTION_AGE_BAND values to SAP bands.

    Each distinct non-null value of *series* is passed once through
    ``normalize_construction_age_band``.

    Args:
        series: Column of raw EPC construction age band values.

    Returns:
        Dict mapping each raw value to a
        ``(sap_band_letter, sap_band_label)`` tuple.
    """
    lookup = {}
    # Normalising per distinct value keeps the number of normaliser calls
    # independent of the number of rows.
    for raw in series.dropna().unique():
        letter, label = normalize_construction_age_band(raw)
        lookup[raw] = (letter, label)
    return lookup


def age_band_to_sap_letter(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with SAP age band columns added.

    Adds ``sap_band_letter`` and ``sap_band_label``, derived from the
    ``CONSTRUCTION_AGE_BAND`` column. Rows whose age band is missing get
    missing values in both new columns.

    Args:
        df: Frame containing a ``CONSTRUCTION_AGE_BAND`` column.

    Returns:
        A new DataFrame; the input frame is not modified.

    Raises:
        KeyError: If ``CONSTRUCTION_AGE_BAND`` is not a column of *df*.
    """
    df = df.copy()
    bands = df["CONSTRUCTION_AGE_BAND"]
    lookup = build_age_band_lookup(bands)

    # Map each component separately instead of joining a helper frame:
    # plain column assignment overwrites existing sap_band_* columns, so
    # re-applying the function does not fail on overlapping column names.
    df["sap_band_letter"] = bands.map(
        {raw: pair[0] for raw, pair in lookup.items()}
    )
    df["sap_band_label"] = bands.map(
        {raw: pair[1] for raw, pair in lookup.items()}
    )
    return df


# Numeric value assigned to each (lower-cased) main-heating efficiency rating.
EFF_MAP = {
    "very poor": 0.60,
    "poor": 0.68,
    "average": 0.75,
    "good": 0.85,
    "very good": 0.92,
}

# Numeric value assigned to each (lower-cased) hot-water efficiency rating.
DHW_EFF_MAP = {
    "very poor": 0.65,
    "poor": 0.72,
    "average": 0.78,
    "good": 0.85,
    "very good": 0.90,
}

# Fallback values used by EPCFeatureEngineer.transform when inputs are missing.
_DEFAULT_FLOOR_HEIGHT = 2.5
_DEFAULT_MAINHEAT_EFF = 0.75  # same value as EFF_MAP["average"]
_DEFAULT_DHW_EFF_WITH_SYSTEM = 0.78  # same value as DHW_EFF_MAP["average"]
_DEFAULT_DHW_EFF = 0.75

energy_system_columns = [
    "MAIN_HEATING_SYSTEM",
    "SECONDARY_HEATING_SYSTEM",
    "MAIN_FUEL_TYPE",
    "DHW_SUPPLY_SYSTEM",
    "VENTILATION_SYSTEM",
    "LIGHTING_FRACTION_LOW_ENERGY",
    "PV_KWP",
    "MAINHEAT_EFF_NUM",
    "ROOF_MM_S9",
    "HOT_WATER_ENERGY_NUM",
]

envelop_columns = [
    "FLOOR_U_VALUE",
    "FLOOR_INSULATION_TYPE",
    "FLOOR_BOUNDARY_TYPE",
    "WALL_U_VALUE",
    "WALL_TYPE",
    "WALL_INSULATION_MODEL",
    "ROOF_U_VALUE",
    "ROOF_CLASS",
    "ROOF_INSULATION_TYPE",
    "glazing_area_m2",
    "glazing_type",
]

general_details = [
    "PROPERTY_TYPE",
    "TOTAL_FLOOR_AREA",
    "BUILT_FORM",
    "sap_band_letter",
    "FLOOR_HEIGHT",
]

# Columns returned by EPCFeatureEngineer.transform, in output order.
features = energy_system_columns + envelop_columns + general_details

# Subset of `features` that transform fills with "UNKNOWN" and casts to str.
cat_cols = [
    "MAIN_HEATING_SYSTEM",
    "SECONDARY_HEATING_SYSTEM",
    "MAIN_FUEL_TYPE",
    "DHW_SUPPLY_SYSTEM",
    "VENTILATION_SYSTEM",
    "FLOOR_INSULATION_TYPE",
    "FLOOR_BOUNDARY_TYPE",
    "WALL_TYPE",
    "WALL_INSULATION_MODEL",
    "ROOF_CLASS",
    "ROOF_INSULATION_TYPE",
    "glazing_type",
    "PROPERTY_TYPE",
    "BUILT_FORM",
    "sap_band_letter",
]


def _rating_to_efficiency(ratings: pd.Series, table: dict) -> pd.Series:
    """Map textual efficiency ratings to numbers, case-insensitively.

    Ratings that are missing or not present in *table* map to NaN.
    """
    # Cast to the pandas string dtype first: the ``.str`` accessor raises
    # AttributeError on a column that is entirely missing and was therefore
    # read as float64.
    return ratings.astype("string").str.lower().map(table)


@dataclass
class SAPTables:
    """Reference tables consumed by the floor, wall and roof feature steps."""

    s3: pd.DataFrame  # passed to floor_feature_engineering_fast
    walls_u: pd.DataFrame  # passed to wall_feature_engineering
    s9: pd.DataFrame  # passed to roof_feature_engineering
    s10: pd.DataFrame  # passed to roof_feature_engineering

    @classmethod
    def from_local_dir(cls, base_dir: str) -> "SAPTables":
        """Load all tables from Excel workbooks located in *base_dir*.

        Expects ``S3_sap.xlsx``, ``external_wall_u_values2.xlsx``,
        ``SAP_Table_ROOF_S9.xlsx`` and ``SAP_Table_ROOF_S10.xlsx`` to exist
        directly inside *base_dir*.

        Args:
            base_dir: Directory holding the workbooks.

        Returns:
            A populated ``SAPTables`` instance.
        """
        base = Path(base_dir)
        return cls(
            s3=pd.read_excel(base / "S3_sap.xlsx"),
            walls_u=pd.read_excel(base / "external_wall_u_values2.xlsx"),
            s9=pd.read_excel(base / "SAP_Table_ROOF_S9.xlsx"),
            s10=pd.read_excel(base / "SAP_Table_ROOF_S10.xlsx"),
        )


class EPCFeatureEngineer:
    """Turn raw EPC records into the model feature frame listed in `features`."""

    def __init__(self, sap: SAPTables):
        """Store the reference tables used by the envelope feature steps."""
        self.sap = sap

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Compute all engineered features for *df*.

        The input frame is left untouched. Empty strings are treated as
        missing values before any feature step runs.

        Args:
            df: Raw EPC records. Must contain the columns this method reads
                directly (``FLOOR_HEIGHT``, ``CONSTRUCTION_AGE_BAND``,
                ``MAINHEAT_ENERGY_EFF``, ``HOT_WATER_ENERGY_EFF``) plus
                whatever the imported feature functions require.

        Returns:
            A DataFrame restricted to, and ordered by, `features`.

        Raises:
            KeyError: If a directly-read column, or any column named in
                `features`, is absent after the feature steps have run.
        """
        # replace() without inplace returns a new frame, so the caller's
        # data is never mutated.
        df = df.replace("", pd.NA)
        df["FLOOR_HEIGHT"] = df["FLOOR_HEIGHT"].fillna(_DEFAULT_FLOOR_HEIGHT)

        # SAP age bands
        df = age_band_to_sap_letter(df)

        # Windows and energy systems
        df = windows_feature_engineering_vectorised(df)
        df = energy_system_feature_engineering_vectorised(df)

        # Envelope: floors, walls, roofs
        df = floor_feature_engineering_fast(df, self.sap.s3)
        df = wall_feature_engineering(df, self.sap.walls_u)
        df = roof_feature_engineering(df, self.sap.s9, self.sap.s10)

        # Heating efficiency: unknown or missing ratings use the default.
        df["MAINHEAT_EFF_NUM"] = _rating_to_efficiency(
            df["MAINHEAT_ENERGY_EFF"], EFF_MAP
        ).fillna(_DEFAULT_MAINHEAT_EFF)

        # Hot water efficiency: when the rating is unusable, the fallback
        # depends on whether a DHW supply system is recorded for the row.
        dhw_eff = _rating_to_efficiency(df["HOT_WATER_ENERGY_EFF"], DHW_EFF_MAP)
        has_dhw_system = df["DHW_SUPPLY_SYSTEM"].notna()
        dhw_eff = dhw_eff.mask(
            dhw_eff.isna() & has_dhw_system, _DEFAULT_DHW_EFF_WITH_SYSTEM
        )
        df["HOT_WATER_ENERGY_NUM"] = dhw_eff.fillna(_DEFAULT_DHW_EFF)

        # Categoricals
        df[cat_cols] = df[cat_cols].fillna("UNKNOWN").astype(str)

        return df[features]