seanerons committed on
Commit 3512a86 · verified · 1 Parent(s): c467986

Upload 7 files

Files changed (7)
  1. app.py +223 -0
  2. charts.py +18 -0
  3. data_utils.py +93 -0
  4. llm_chat.py +91 -0
  5. ml_price_prediction.py +375 -0
  6. plan_executor.py +59 -0
  7. predictor.py +50 -0
app.py ADDED
@@ -0,0 +1,223 @@
"""
Dashboard entry — UK Housing + Falcon Chat (Python ≤3.9)
Run: python app.py

ENV (optional):
    FALCON_MODEL=tiiuae/Falcon-H1-0.5B-Instruct   # default; swap to a 7B Falcon on GPU
    DEVICE_MAP=cpu                                # set to "auto" on Spaces GPU
    MAX_NEW_TOKENS=320
    PORT=8050
"""

import os
from datetime import datetime

import dash
from dash import dcc, html, Input, Output, State
import dash_bootstrap_components as dbc
from dash import dash_table
import plotly.express as px

import pandas as pd

from data_utils import load_raw_data, enrich_data
from predictor import predict_price
from charts import build_trend, build_distribution, build_city_bar, build_type_pie
from llm_chat import plan_from_question, explain_answer
from plan_executor import execute_plan

# ---------- Data ----------
RAW = load_raw_data()
df = enrich_data(RAW)

# ---------- Dash ----------
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
server = app.server

# ===== Top: LLM Chat =====
chat_card = dbc.Card([
    dbc.CardHeader(html.H4("💬 Ask the Data (Falcon)")),
    dbc.CardBody([
        dbc.Row([
            dbc.Col(dcc.Textarea(
                id="chat-question",
                placeholder="Ask anything about the dataset (e.g., 'Aside from London, with a £500k budget what can I afford?')",
                style={"width": "100%", "height": "90px"}
            ), md=9),
            dbc.Col([
                html.Label("Rows limit"),
                dcc.Input(id="rows-limit", type="number", value=10, min=1, max=50, style={"width": "100%"}),
                dbc.Button("Ask", id="ask-btn", color="primary", className="mt-3 w-100"),
            ], md=3),
        ], className="gy-2"),
        html.Div(id="chat-answer", className="mt-3"),
        dash_table.DataTable(id="chat-table", page_size=10, style_table={"overflowX": "auto"}),
    ])
], className="mb-4 shadow-sm")

# ===== Predictor =====
predictor_card = dbc.Card([
    dbc.CardHeader(html.H4("🔮 Price Prediction Tool")),
    dbc.CardBody([
        dbc.Row([
            dbc.Col([dbc.Label("Square Footage"), dbc.Input(id="sqft-in", type="number", value=1200)], width=3),
            dbc.Col([dbc.Label("Bedrooms"), dbc.Input(id="beds-in", type="number", value=3)], width=3),
            dbc.Col([dbc.Label("Bathrooms"), dbc.Input(id="baths-in", type="number", value=2)], width=3),
            dbc.Col([dbc.Label("Year Built"), dbc.Input(id="year-in", type="number", value=2005)], width=3),
        ], className="gy-2"),
        dbc.Row([
            dbc.Col([dbc.Label("City"), dcc.Dropdown(
                id="city-dd",
                options=[{"label": c, "value": c} for c in sorted(df["Location_City"].dropna().unique())],
                value="London")], width=3),
            dbc.Col([dbc.Label("Property Type"), dcc.Dropdown(
                id="type-dd",
                options=[{"label": p, "value": p} for p in sorted(df["Property_Type"].dropna().unique())],
                value="Detached House")], width=3),
            dbc.Col([dbc.Label("Quality (1–10)"), dcc.Slider(
                id="quality-in", min=1, max=10, step=1, value=7,
                marks={i: str(i) for i in range(1, 11)})], width=6),
        ]),
        dbc.Button("Predict Price", id="predict-btn", color="primary", className="mt-3"),
        html.Div(id="prediction-output", className="h5 mt-3"),
    ])
], className="mb-4 shadow-sm")

# ===== Summary Cards =====
summary_cards = dbc.Row([
    dbc.Col(dbc.Card(dbc.CardBody([html.H4(f"{len(df):,}", className="card-title text-primary"), html.P("Total Properties")])), width=3),
    dbc.Col(dbc.Card(dbc.CardBody([html.H4(f"£{df['Sale_Price_GBP'].mean():,.0f}", className="card-title text-success"), html.P("Average Price")])), width=3),
    dbc.Col(dbc.Card(dbc.CardBody([html.H4(f"{df['Location_City'].nunique()}", className="card-title text-info"), html.P("Cities")])), width=3),
    dbc.Col(dbc.Card(dbc.CardBody([html.H4(f"{df['Property_Type'].nunique()}", className="card-title text-warning"), html.P("Property Types")])), width=3),
], className="mb-4")

# ===== Filters =====
filters_row = dbc.Row([
    dbc.Col([html.Label("Select City:"), dcc.Dropdown(
        id="city-filter",
        options=[{"label": c, "value": c} for c in sorted(df["Location_City"].dropna().unique())],
        multi=True)], width=3),
    dbc.Col([html.Label("Property Type:"), dcc.Dropdown(
        id="type-filter",
        options=[{"label": p, "value": p} for p in sorted(df["Property_Type"].dropna().unique())],
        multi=True)], width=3),
    dbc.Col([html.Label("Year Range:"), dcc.RangeSlider(
        id="year-range",
        min=int(df["Year"].min()), max=int(df["Year"].max()),
        value=[int(df["Year"].min()), int(df["Year"].max())],
        marks={str(y): str(y) for y in range(int(df["Year"].min()), int(df["Year"].max()) + 1, 2)},
        step=1)], width=6),
], className="mb-4")

# ===== Charts =====
charts_row_1 = dbc.Row([
    dbc.Col(dcc.Graph(id="price-trend-chart"), width=6),
    dbc.Col(dcc.Graph(id="price-distribution-chart"), width=6),
], className="mb-4")

charts_row_2 = dbc.Row([
    dbc.Col(dcc.Graph(id="city-comparison-chart"), width=6),
    dbc.Col(dcc.Graph(id="property-type-chart"), width=6),
], className="mb-4")

# ===== Layout =====
app.layout = dbc.Container([
    html.H2("🏠 UK Housing Market Analysis", className="mt-3 mb-2 text-center"),
    chat_card,
    predictor_card,
    summary_cards,
    filters_row,
    charts_row_1,
    charts_row_2,
], fluid=True)

# ---------- Callbacks: Charts ----------
@app.callback(
    [Output("price-trend-chart", "figure"),
     Output("price-distribution-chart", "figure"),
     Output("city-comparison-chart", "figure"),
     Output("property-type-chart", "figure")],
    [Input("city-filter", "value"),
     Input("type-filter", "value"),
     Input("year-range", "value")]
)
def update_charts(city_filter, type_filter, year_range):
    filtered = df.copy()
    if city_filter:
        filtered = filtered[filtered["Location_City"].isin(city_filter)]
    if type_filter:
        filtered = filtered[filtered["Property_Type"].isin(type_filter)]
    if year_range:
        filtered = filtered[(filtered["Year"] >= year_range[0]) & (filtered["Year"] <= year_range[1])]

    return (
        build_trend(filtered),
        build_distribution(filtered),
        build_city_bar(filtered),
        build_type_pie(filtered),
    )

# ---------- Callback: Predictor ----------
@app.callback(
    Output("prediction-output", "children"),
    Input("predict-btn", "n_clicks"),
    State("sqft-in", "value"),
    State("beds-in", "value"),
    State("baths-in", "value"),
    State("year-in", "value"),
    State("quality-in", "value"),
    State("city-dd", "value"),
    State("type-dd", "value"),
    prevent_initial_call=True
)
def on_predict(n, sqft, beds, baths, year_built, quality, city, prop_type):
    price, used_model = predict_price(
        float(sqft or 0), int(beds or 0), int(baths or 0),
        int(year_built or datetime.now().year), int(quality or 6),
        city or "London", prop_type or "Townhouse"
    )
    note = "Predicted using trained model." if used_model else "Estimated using fallback."
    return f"£{price:,.0f} ({note})"

# ---------- Callback: LLM Chat ----------
@app.callback(
    [Output("chat-answer", "children"),
     Output("chat-table", "data"),
     Output("chat-table", "columns")],
    Input("ask-btn", "n_clicks"),
    State("chat-question", "value"),
    State("rows-limit", "value"),
    prevent_initial_call=True
)
def on_ask(n_clicks, question, limit):
    if not question or not str(question).strip():
        return "Please enter a question.", [], []

    plan = plan_from_question(question, df)
    if limit:
        try:
            plan["limit"] = int(limit)
        except Exception:
            pass

    try:
        result = execute_plan(df, plan)
    except Exception as e:
        return f"Sorry, I couldn't compute that: {e}", [], []

    try:
        answer = explain_answer(question, result)
    except Exception:
        answer = "Here are the results based on your query."

    cols = [{"name": c, "id": c} for c in result.columns]
    data = result.to_dict("records")
    return answer, data, cols

# ---------- Main ----------
if __name__ == "__main__":
    port = int(os.environ.get("PORT", 8050))
    app.run(host="0.0.0.0", port=port, debug=False)
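For local testing, a minimal launch sketch (not part of this commit; the file name run_local.py and the debug settings are my assumptions). It only sets the optional environment variables documented in the docstring, using the same defaults the code already falls back to, and then starts the Dash server.

# run_local.py - hypothetical helper, not included in this commit
import os

# Same defaults as app.py / llm_chat.py; override before launch as needed.
os.environ.setdefault("FALCON_MODEL", "tiiuae/Falcon-H1-0.5B-Instruct")
os.environ.setdefault("DEVICE_MAP", "cpu")
os.environ.setdefault("MAX_NEW_TOKENS", "320")
os.environ.setdefault("PORT", "8050")

from app import app  # importing app loads the dataset and builds the layout

if __name__ == "__main__":
    app.run(host="127.0.0.1", port=int(os.environ["PORT"]), debug=True)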
charts.py ADDED
@@ -0,0 +1,18 @@
import plotly.express as px
import pandas as pd

def build_trend(filtered: pd.DataFrame):
    yearly = filtered.groupby("Year")["Sale_Price_GBP"].mean().reset_index()
    return px.line(yearly, x="Year", y="Sale_Price_GBP", title="Average Prices Over Time")

def build_distribution(filtered: pd.DataFrame):
    return px.histogram(filtered, x="Sale_Price_GBP", nbins=50, title="Price Distribution")

def build_city_bar(filtered: pd.DataFrame):
    city_stats = filtered.groupby("Location_City")["Sale_Price_GBP"].mean().reset_index()
    return px.bar(city_stats, x="Location_City", y="Sale_Price_GBP", title="Average Price by City")

def build_type_pie(filtered: pd.DataFrame):
    type_stats = filtered["Property_Type"].value_counts().reset_index()
    type_stats.columns = ["Property_Type", "count"]
    return px.pie(type_stats, values="count", names="Property_Type", title="Property Type Distribution")
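A quick smoke-test sketch (my addition, not part of the commit): the chart builders only need the Year, Sale_Price_GBP, Location_City and Property_Type columns, so a toy frame is enough to exercise all four.

# Hypothetical check for charts.py; assumes plotly and pandas are installed.
import pandas as pd
from charts import build_trend, build_distribution, build_city_bar, build_type_pie

toy = pd.DataFrame({
    "Year": [2022, 2022, 2023, 2023],
    "Sale_Price_GBP": [450000, 600000, 480000, 620000],
    "Location_City": ["London", "Bristol", "London", "Bristol"],
    "Property_Type": ["Townhouse", "Detached House", "Townhouse", "Townhouse"],
})

for fig in (build_trend(toy), build_distribution(toy), build_city_bar(toy), build_type_pie(toy)):
    print(type(fig).__name__)  # each builder returns a plotly Figure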
data_utils.py ADDED
@@ -0,0 +1,93 @@
from pathlib import Path
from datetime import datetime
from typing import Optional
import numpy as np
import pandas as pd

DATA_CANDIDATES = [
    "data/uk_real_estate_dataset_with_revenue (1).csv",
    "data/uk_real_estate_dataset_with_revenue.csv",
    "data/uk_real_estate_dataset.csv",
    "uk_real_estate_dataset_with_revenue (1).csv",
    "uk_real_estate_dataset.csv",
]

CENTRAL_DISTRICTS = {
    "Kensington", "Chelsea", "Islington", "Camden", "Hackney", "Westminster",
    "Southwark", "Lambeth", "Hammersmith", "Fulham", "Tower Hamlets", "Brixton", "Shoreditch"
}

def _find_data_file() -> Optional[Path]:
    for p in DATA_CANDIDATES:
        path = Path(p)
        if path.exists():
            return path
    return None

def load_raw_data() -> pd.DataFrame:
    path = _find_data_file()
    if path is None:
        # No CSV found: build a tiny synthetic sample so the app still runs,
        # then fall through to the date handling below so "Year" gets derived.
        now_year = datetime.now().year
        df = pd.DataFrame({
            "Property_ID": list(range(1, 6)),
            "Sale_Price_GBP": [500000, 650000, 825000, 1200000, 430000],
            "Square_Footage": [950, 1200, 1600, 2200, 800],
            "Bedrooms": [2, 3, 3, 4, 2],
            "Bathrooms": [1, 2, 2, 3, 1],
            "Year_Built": [1998, 2005, 2012, 1980, 2018],
            "Quality_Score": [6, 7, 8, 7, 6],
            "Location_City": ["London", "London", "Manchester", "London", "Bristol"],
            "Location_District": ["Islington", "Camden", "Didsbury", "Kensington", "Clifton"],
            "Property_Type": ["Townhouse", "Detached House", "Detached House", "Townhouse", "Townhouse"],
            "Sale_Date": pd.date_range(str(now_year - 1) + "-01-01", periods=5, freq="90D"),
        })
    else:
        df = pd.read_csv(path)

    if "Sale_Date" in df.columns:
        df["Sale_Date"] = pd.to_datetime(df["Sale_Date"], errors="coerce")
        dt = df["Sale_Date"]
    elif "Listing_Date" in df.columns:
        df["Listing_Date"] = pd.to_datetime(df["Listing_Date"], errors="coerce")
        dt = df["Listing_Date"]
    else:
        dt = None

    if dt is not None:
        df["Year"] = dt.dt.year
    else:
        df["Year"] = datetime.now().year

    return df

def enrich_data(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    now_year = datetime.now().year

    for col, default in [
        ("Square_Footage", np.nan),
        ("Bedrooms", 0),
        ("Bathrooms", 0),
        ("Year_Built", now_year),
        ("Quality_Score", 6),
        ("Location_City", "London"),
        ("Location_District", "Westminster"),
        ("Property_Type", "Townhouse"),
        ("Sale_Price_GBP", np.nan),
    ]:
        if col not in df.columns:
            df[col] = default

    df["Price_Per_Sqft"] = df["Sale_Price_GBP"] / df["Square_Footage"].replace(0, np.nan)
    df["Price_Per_Sqft"] = df["Price_Per_Sqft"].fillna(df["Price_Per_Sqft"].median())

    df["Property_Age"] = (df["Year"] - df["Year_Built"]).clip(lower=0)
    df["Total_Rooms"] = (df["Bedrooms"] + df["Bathrooms"]).replace(0, np.nan).fillna(1)
    df["Size_Per_Room"] = df["Square_Footage"] / df["Total_Rooms"]

    df["Is_London"] = (df["Location_City"].astype(str) == "London").astype(int)
    df["Is_Central_London"] = df["Location_District"].isin(CENTRAL_DISTRICTS).astype(int)

    df["Is_Detached"] = (df["Property_Type"] == "Detached House").astype(int)
    df["Is_Townhouse"] = (df["Property_Type"] == "Townhouse").astype(int)

    return df
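For reference, a short sketch of how app.py and predictor.py consume this module (my addition, not part of the commit); it only prints the columns that enrich_data derives on top of the raw CSV or the synthetic sample.

# Hypothetical usage of data_utils.py.
from data_utils import load_raw_data, enrich_data

raw = load_raw_data()   # CSV if one of DATA_CANDIDATES exists, otherwise the 5-row sample
df = enrich_data(raw)

derived = ["Price_Per_Sqft", "Property_Age", "Total_Rooms", "Size_Per_Room",
           "Is_London", "Is_Central_London", "Is_Detached", "Is_Townhouse"]
print(df[derived].head())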
llm_chat.py ADDED
@@ -0,0 +1,91 @@
import os, re, json
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

MODEL_ID = os.environ.get("FALCON_MODEL", "tiiuae/Falcon-H1-0.5B-Instruct")
DEVICE_MAP = os.environ.get("DEVICE_MAP", "cpu")  # set "auto" on GPU Spaces
MAX_NEW_TOKENS = int(os.environ.get("MAX_NEW_TOKENS", "320"))

_tok = None
_model = None
_llm = None

def _get_llm():
    """Lazy-load Falcon on first use so the Dash UI can start immediately."""
    global _tok, _model, _llm
    if _llm is None:
        print(f">>> Loading Falcon model: {MODEL_ID} (device_map={DEVICE_MAP})")
        _tok = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
        _model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=None, device_map=DEVICE_MAP)
        _llm = pipeline(
            "text-generation",
            model=_model,
            tokenizer=_tok,
            do_sample=False,
            temperature=0.0,
            max_new_tokens=MAX_NEW_TOKENS,
            pad_token_id=_tok.eos_token_id,
            return_full_text=False,
        )
        print(">>> Falcon ready.")
    return _llm

SYSTEM_PLANNER = (
    "You translate user questions about a housing CSV into a STRICT JSON plan.\n"
    "Allowed keys: task, filters, groupby, metrics, sort_by, limit.\n"
    "Filters support eq, in, not_in, gte, lte. Metrics: mean, median, count.\n"
    "Only use known columns and metrics. No code. Respond with JSON only."
)

def _schema_from_df(df: pd.DataFrame):
    cols = list(df.columns)
    categoricals = [c for c in ["Location_City", "Property_Type", "Location_District"] if c in df.columns]
    numerics = [c for c in ["Sale_Price_GBP", "Square_Footage", "Bedrooms", "Bathrooms", "Year", "Price_Per_Sqft"] if c in df.columns]
    return cols, categoricals, numerics

def plan_from_question(question: str, df: pd.DataFrame) -> dict:
    llm = _get_llm()
    cols, cats, nums = _schema_from_df(df)
    fewshot = (
        "Columns: %s\nCategoricals: %s\nNumerics: %s\nAllowed metrics: ['mean','median','count']\n"
        "Example Q: Aside from London, with a budget of £500,000, which place and property can I afford?\n"
        "Example JSON: {\"task\":\"affordability\",\"filters\":{\"Sale_Price_GBP\":{\"lte\":500000},"
        "\"Location_City\":{\"not_in\":[\"London\"]}},\"groupby\":[\"Location_City\",\"Property_Type\"],"
        "\"metrics\":[{\"col\":\"Sale_Price_GBP\",\"op\":\"mean\",\"label\":\"avg_price\"}],"
        "\"sort_by\":[{\"col\":\"avg_price\",\"asc\":true}],\"limit\":10}"
    ) % (cols, cats, nums)

    prompt = SYSTEM_PLANNER + "\n\n" + fewshot + "\n\nUser question: " + question + "\nJSON:"
    out = llm(prompt)[0]["generated_text"].strip()
    m = re.search(r"\{[\s\S]*\}$", out)
    if not m:
        return {
            "task": "fallback_top_cities",
            "groupby": ["Location_City"],
            "metrics": [{"col": "Sale_Price_GBP", "op": "mean", "label": "avg_price"}],
            "sort_by": [{"col": "avg_price", "asc": False}],
            "limit": 10,
        }
    try:
        return json.loads(m.group(0))
    except Exception:
        return {
            "task": "fallback_top_cities",
            "groupby": ["Location_City"],
            "metrics": [{"col": "Sale_Price_GBP", "op": "mean", "label": "avg_price"}],
            "sort_by": [{"col": "avg_price", "asc": False}],
            "limit": 10,
        }

EXPLAIN_SYS = (
    "You are a concise analyst. Given a small results table and the user's question, "
    "write 2–5 short sentences with the main takeaway. Mention key filters (e.g., budget, city)."
)

def explain_answer(question: str, table: pd.DataFrame) -> str:
    llm = _get_llm()
    preview = table.to_csv(index=False)
    prompt = EXPLAIN_SYS + "\n\nQuestion: " + question + "\nTable:\n" + preview + "\n\nAnswer:"
    out = llm(prompt)[0]["generated_text"].strip()
    return out
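For clarity, the plan the planner is prompted to emit (and that plan_executor.py consumes) follows the few-shot example embedded in the prompt above; written as a Python dict it looks like this:

# Shape of the plan returned by plan_from_question (mirrors the few-shot example).
example_plan = {
    "task": "affordability",
    "filters": {
        "Sale_Price_GBP": {"lte": 500000},
        "Location_City": {"not_in": ["London"]},
    },
    "groupby": ["Location_City", "Property_Type"],
    "metrics": [{"col": "Sale_Price_GBP", "op": "mean", "label": "avg_price"}],
    "sort_by": [{"col": "avg_price", "asc": True}],
    "limit": 10,
}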
ml_price_prediction.py ADDED
@@ -0,0 +1,375 @@
"""
Machine Learning Models for UK Housing Price Prediction
Multiple algorithms comparison and optimization
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')

class HousingPricePredictor:
    def __init__(self, data_path):
        """Initialize the predictor with the dataset"""
        self.data = pd.read_csv(data_path)
        self.prepare_data()
        self.models = {}
        self.results = {}

    def prepare_data(self):
        """Clean and prepare the data for machine learning"""
        print("Preparing data for machine learning...")

        # Convert date column
        self.data['Listing_Date'] = pd.to_datetime(self.data['Listing_Date'])
        self.data['Year'] = self.data['Listing_Date'].dt.year
        self.data['Month'] = self.data['Listing_Date'].dt.month
        self.data['Quarter'] = self.data['Listing_Date'].dt.quarter

        # Create derived features
        self.data['Price_Per_Sqft'] = self.data['Sale_Price_GBP'] / self.data['Square_Footage']
        self.data['Property_Age'] = self.data['Year'] - self.data['Year_Built']
        self.data['Total_Rooms'] = self.data['Bedrooms'] + self.data['Bathrooms']
        self.data['Size_Per_Room'] = self.data['Square_Footage'] / self.data['Total_Rooms']

        # Handle missing values
        self.data['Nearby_Amenities_Score'] = self.data['Nearby_Amenities_Score'].fillna(
            self.data['Nearby_Amenities_Score'].median()
        )

        # Create location-based features
        self.data['Is_London'] = (self.data['Location_City'] == 'London').astype(int)
        self.data['Is_Central_London'] = self.data['Location_District'].isin([
            'Kensington', 'Chelsea', 'Islington', 'Camden', 'Hackney', 'Brixton', 'Shoreditch'
        ]).astype(int)

        # Create property type dummies
        self.data['Is_Detached'] = (self.data['Property_Type'] == 'Detached House').astype(int)
        self.data['Is_Townhouse'] = (self.data['Property_Type'] == 'Townhouse').astype(int)

        print(f"Dataset prepared: {self.data.shape}")

    def prepare_features(self):
        """Prepare features for machine learning"""
        # Select features for modeling
        feature_columns = [
            'Square_Footage', 'Bedrooms', 'Bathrooms', 'Year_Built', 'Property_Age',
            'Build_Quality_Rating', 'Nearby_Amenities_Score', 'Market_Trend_Index',
            'Days_On_Market', 'Agent_Commission_Percentage', 'Year', 'Month', 'Quarter',
            'Total_Rooms', 'Size_Per_Room', 'Is_London', 'Is_Central_London',
            'Is_Detached', 'Is_Townhouse'
        ]

        # Categorical features for encoding
        categorical_features = ['Location_City', 'Location_District', 'Property_Type']

        # Prepare X and y
        X_numeric = self.data[feature_columns]
        X_categorical = self.data[categorical_features]
        y = self.data['Sale_Price_GBP']

        # Encode categorical variables
        le_city = LabelEncoder()
        le_district = LabelEncoder()
        le_type = LabelEncoder()

        X_categorical_encoded = pd.DataFrame({
            'Location_City': le_city.fit_transform(X_categorical['Location_City']),
            'Location_District': le_district.fit_transform(X_categorical['Location_District']),
            'Property_Type': le_type.fit_transform(X_categorical['Property_Type'])
        })

        # Combine all features
        X = pd.concat([X_numeric, X_categorical_encoded], axis=1)

        # Store encoders for later use
        self.encoders = {
            'city': le_city,
            'district': le_district,
            'type': le_type
        }

        return X, y

    def train_models(self):
        """Train multiple machine learning models"""
        print("\nTraining machine learning models...")

        X, y = self.prepare_features()

        # Split the data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Store split data
        self.X_train, self.X_test = X_train, X_test
        self.y_train, self.y_test = y_train, y_test

        # Scale the features
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        self.scaler = scaler

        # Define models
        models = {
            'Linear Regression': LinearRegression(),
            'Ridge Regression': Ridge(alpha=1.0),
            'Lasso Regression': Lasso(alpha=0.1),
            'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
            'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
            'SVR': SVR(kernel='rbf', C=1.0, gamma='scale')
        }

        # Train and evaluate models
        for name, model in models.items():
            print(f"Training {name}...")

            # Use scaled data for models that benefit from it
            if name in ['Linear Regression', 'Ridge Regression', 'Lasso Regression', 'SVR']:
                model.fit(X_train_scaled, y_train)
                y_pred = model.predict(X_test_scaled)
            else:
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)

            # Calculate metrics
            mae = mean_absolute_error(y_test, y_pred)
            mse = mean_squared_error(y_test, y_pred)
            rmse = np.sqrt(mse)
            r2 = r2_score(y_test, y_pred)

            # Store results
            self.models[name] = model
            self.results[name] = {
                'MAE': mae,
                'MSE': mse,
                'RMSE': rmse,
                'R2': r2,
                'predictions': y_pred
            }

            print(f"  MAE: £{mae:,.2f}")
            print(f"  RMSE: £{rmse:,.2f}")
            print(f"  R²: {r2:.4f}")
            print()

    def optimize_best_model(self):
        """Optimize the best performing model using GridSearchCV"""
        print("Optimizing the best model...")

        # Find best model based on R² score
        best_model_name = max(self.results.keys(), key=lambda x: self.results[x]['R2'])
        print(f"Best model: {best_model_name} (R² = {self.results[best_model_name]['R2']:.4f})")

        if best_model_name == 'Random Forest':
            # Optimize Random Forest
            param_grid = {
                'n_estimators': [100, 200, 300],
                'max_depth': [10, 20, None],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4]
            }

            rf = RandomForestRegressor(random_state=42)
            grid_search = GridSearchCV(
                rf, param_grid, cv=5, scoring='r2', n_jobs=-1
            )
            grid_search.fit(self.X_train, self.y_train)

            # Update best model
            self.models['Random Forest Optimized'] = grid_search.best_estimator_
            y_pred_opt = grid_search.best_estimator_.predict(self.X_test)

            # Calculate optimized metrics
            mae_opt = mean_absolute_error(self.y_test, y_pred_opt)
            mse_opt = mean_squared_error(self.y_test, y_pred_opt)
            rmse_opt = np.sqrt(mse_opt)
            r2_opt = r2_score(self.y_test, y_pred_opt)

            self.results['Random Forest Optimized'] = {
                'MAE': mae_opt,
                'MSE': mse_opt,
                'RMSE': rmse_opt,
                'R2': r2_opt,
                'predictions': y_pred_opt
            }

            print("Optimized Random Forest:")
            print(f"  MAE: £{mae_opt:,.2f}")
            print(f"  RMSE: £{rmse_opt:,.2f}")
            print(f"  R²: {r2_opt:.4f}")
            print(f"  Best parameters: {grid_search.best_params_}")

    def feature_importance_analysis(self):
        """Analyze feature importance for the best model"""
        print("\nAnalyzing feature importance...")

        # Get the best model
        best_model_name = max(self.results.keys(), key=lambda x: self.results[x]['R2'])
        best_model = self.models[best_model_name]

        if hasattr(best_model, 'feature_importances_'):
            # Get feature names
            feature_names = self.X_train.columns

            # Get feature importances
            importances = best_model.feature_importances_

            # Create importance dataframe
            importance_df = pd.DataFrame({
                'feature': feature_names,
                'importance': importances
            }).sort_values('importance', ascending=False)

            print(f"\nTop 10 Most Important Features ({best_model_name}):")
            print(importance_df.head(10))

            # Plot feature importance
            plt.figure(figsize=(10, 8))
            top_features = importance_df.head(15)
            sns.barplot(data=top_features, x='importance', y='feature')
            plt.title(f'Feature Importance - {best_model_name}')
            plt.xlabel('Importance')
            plt.tight_layout()
            plt.savefig('feature_importance.png', dpi=300, bbox_inches='tight')
            plt.show()

            return importance_df
        else:
            print(f"Model {best_model_name} does not support feature importance analysis.")
            return None

    def create_predictions_visualization(self):
        """Create visualizations for model predictions"""
        print("\nCreating prediction visualizations...")

        # Create subplots
        fig, axes = plt.subplots(2, 3, figsize=(18, 12))
        axes = axes.ravel()

        model_names = list(self.results.keys())

        for i, (name, results) in enumerate(self.results.items()):
            if i >= 6:  # Limit to 6 plots
                break

            ax = axes[i]
            y_pred = results['predictions']

            # Scatter plot: Actual vs Predicted
            ax.scatter(self.y_test, y_pred, alpha=0.5)
            ax.plot([self.y_test.min(), self.y_test.max()],
                    [self.y_test.min(), self.y_test.max()], 'r--', lw=2)
            ax.set_xlabel('Actual Price (GBP)')
            ax.set_ylabel('Predicted Price (GBP)')
            ax.set_title(f'{name}\nR² = {results["R2"]:.4f}')
            ax.grid(True)

        # Hide unused subplots
        for i in range(len(model_names), 6):
            axes[i].set_visible(False)

        plt.tight_layout()
        plt.savefig('model_predictions.png', dpi=300, bbox_inches='tight')
        plt.show()

    def model_comparison(self):
        """Compare all models and create summary"""
        print("\n" + "="*60)
        print("MODEL COMPARISON SUMMARY")
        print("="*60)

        # Create comparison dataframe
        comparison_data = []
        for name, results in self.results.items():
            comparison_data.append({
                'Model': name,
                'MAE (GBP)': f"£{results['MAE']:,.2f}",
                'RMSE (GBP)': f"£{results['RMSE']:,.2f}",
                'R² Score': f"{results['R2']:.4f}",
                'MAE %': f"{(results['MAE'] / self.y_test.mean()) * 100:.2f}%"
            })

        comparison_df = pd.DataFrame(comparison_data)
        comparison_df = comparison_df.sort_values('R² Score', ascending=False)

        print(comparison_df.to_string(index=False))

        # Create comparison visualization
        plt.figure(figsize=(15, 5))

        # R² Score comparison
        plt.subplot(1, 3, 1)
        models = list(self.results.keys())
        r2_scores = [self.results[model]['R2'] for model in models]
        plt.bar(models, r2_scores)
        plt.title('R² Score Comparison')
        plt.ylabel('R² Score')
        plt.xticks(rotation=45)

        # MAE comparison
        plt.subplot(1, 3, 2)
        mae_scores = [self.results[model]['MAE'] for model in models]
        plt.bar(models, mae_scores)
        plt.title('Mean Absolute Error Comparison')
        plt.ylabel('MAE (GBP)')
        plt.xticks(rotation=45)

        # RMSE comparison
        plt.subplot(1, 3, 3)
        rmse_scores = [self.results[model]['RMSE'] for model in models]
        plt.bar(models, rmse_scores)
        plt.title('Root Mean Square Error Comparison')
        plt.ylabel('RMSE (GBP)')
        plt.xticks(rotation=45)

        plt.tight_layout()
        plt.savefig('model_comparison.png', dpi=300, bbox_inches='tight')
        plt.show()

    def predict_price(self, property_data):
        """Predict price for a new property"""
        # This would be used for making predictions on new data
        # For now, just return the best model
        best_model_name = max(self.results.keys(), key=lambda x: self.results[x]['R2'])
        return self.models[best_model_name]

    def run_complete_analysis(self):
        """Run the complete machine learning analysis"""
        print("UK HOUSING PRICE PREDICTION - MACHINE LEARNING ANALYSIS")
        print("="*60)

        self.train_models()
        self.optimize_best_model()
        self.feature_importance_analysis()
        self.create_predictions_visualization()
        self.model_comparison()

        print("\n" + "="*60)
        print("MACHINE LEARNING ANALYSIS COMPLETE")
        print("="*60)
        print("Visualizations saved:")
        print("- feature_importance.png")
        print("- model_predictions.png")
        print("- model_comparison.png")

if __name__ == "__main__":
    # Initialize predictor
    predictor = HousingPricePredictor('data/uk_real_estate_dataset_with_revenue (1).csv')

    # Run complete analysis
    predictor.run_complete_analysis()
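One gap worth noting: predictor.py looks for models/gradient_boosting_model.pkl, but nothing in this commit writes that file, and the class above trains on a wider feature set than the 12 columns predictor.py builds at inference time. A hedged export sketch (my assumption about how the artifact could be produced, not the author's script) that trains on exactly those 12 columns via data_utils:

# export_model.py - hypothetical; fits a GB model on the 12 features predictor.py sends.
from pathlib import Path
import joblib
from sklearn.ensemble import GradientBoostingRegressor
from data_utils import load_raw_data, enrich_data

FEATURES = ["Square_Footage", "Bedrooms", "Bathrooms", "Year_Built", "Property_Age",
            "Quality_Score", "Total_Rooms", "Size_Per_Room", "Is_London",
            "Is_Central_London", "Is_Detached", "Is_Townhouse"]

df = enrich_data(load_raw_data()).dropna(subset=["Sale_Price_GBP", "Square_Footage"])
model = GradientBoostingRegressor(n_estimators=100, random_state=42)
model.fit(df[FEATURES], df["Sale_Price_GBP"])

Path("models").mkdir(exist_ok=True)
joblib.dump(model, "models/gradient_boosting_model.pkl")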
plan_executor.py ADDED
@@ -0,0 +1,59 @@
import pandas as pd

ALLOWED_OPS = {"mean", "median", "count"}

def execute_plan(df: pd.DataFrame, plan: dict) -> pd.DataFrame:
    q = df.copy()

    # filters
    for col, rule in (plan.get("filters") or {}).items():
        if col not in q.columns:
            raise ValueError("Unknown column: %s" % col)
        if not isinstance(rule, dict):
            raise ValueError("Bad filter rule for %s" % col)
        if "eq" in rule:
            q = q[q[col] == rule["eq"]]
        if "in" in rule:
            q = q[q[col].isin(rule["in"])]
        if "not_in" in rule:
            q = q[~q[col].isin(rule["not_in"])]
        if "gte" in rule:
            q = q[q[col] >= rule["gte"]]
        if "lte" in rule:
            q = q[q[col] <= rule["lte"]]

    groupby = plan.get("groupby") or []
    metrics = plan.get("metrics") or []

    if groupby:
        gb = q.groupby(groupby, dropna=False)
        agg_dict = {}
        for m in metrics:
            col, op = m.get("col"), m.get("op")
            label = m.get("label", "%s_%s" % (op, col))
            if op not in ALLOWED_OPS:
                raise ValueError("Unsupported op: %s" % op)
            if op == "count":
                agg_dict[label] = (col, "count")
            else:
                agg_dict[label] = (col, op)
        res = gb.agg(**agg_dict).reset_index() if agg_dict else gb.size().reset_index(name="count")
    else:
        # global summary
        rows = {}
        for m in metrics:
            col, op = m.get("col"), m.get("op")
            label = m.get("label", "%s_%s" % (op, col))
            if op not in ALLOWED_OPS:
                raise ValueError("Unsupported op: %s" % op)
            if op == "count":
                rows[label] = int(q[col].count())
            else:
                rows[label] = float(getattr(q[col], op)())
        res = pd.DataFrame([rows]) if rows else q.head(20)

    for s in (plan.get("sort_by") or []):
        res = res.sort_values(s.get("col"), ascending=bool(s.get("asc", True)))

    limit = min(int(plan.get("limit", 20)), 50)
    return res.head(limit)
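A quick, LLM-free sketch of the executor (my addition, not part of the commit): the same plan shape shown in llm_chat.py can be built by hand and run directly against the enriched frame.

# Hypothetical check for plan_executor.py.
from data_utils import load_raw_data, enrich_data
from plan_executor import execute_plan

df = enrich_data(load_raw_data())
plan = {
    "filters": {"Sale_Price_GBP": {"lte": 500000}, "Location_City": {"not_in": ["London"]}},
    "groupby": ["Location_City", "Property_Type"],
    "metrics": [{"col": "Sale_Price_GBP", "op": "mean", "label": "avg_price"}],
    "sort_by": [{"col": "avg_price", "asc": True}],
    "limit": 10,
}
print(execute_plan(df, plan))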
predictor.py ADDED
@@ -0,0 +1,50 @@
from datetime import datetime
from pathlib import Path
import joblib
import pandas as pd

from data_utils import enrich_data, load_raw_data

# Load once for fallback stats
_df = enrich_data(load_raw_data())

MODEL_DIR = Path("models")
_model = None
try:
    if (MODEL_DIR / "gradient_boosting_model.pkl").exists():
        _model = joblib.load(MODEL_DIR / "gradient_boosting_model.pkl")
except Exception as e:
    print("Model artifact loading warning:", e)

def predict_price(sqft: float, bedrooms: int, bathrooms: int, year_built: int,
                  quality: int, city: str, prop_type: str):
    if _model is None:
        # fallback: price per sqft × sqft with quality tweak
        city_pps = _df[_df["Location_City"] == city]["Price_Per_Sqft"].mean()
        if pd.isna(city_pps):
            city_pps = _df["Price_Per_Sqft"].mean()
        price = sqft * float(city_pps) * (1 + (quality - 5) * 0.02)
        return price, False
    try:
        age = datetime.now().year - year_built
        total_rooms = bedrooms + bathrooms
        size_per_room = sqft / total_rooms if total_rooms else sqft
        row = pd.DataFrame([{
            "Square_Footage": sqft,
            "Bedrooms": bedrooms,
            "Bathrooms": bathrooms,
            "Year_Built": year_built,
            "Property_Age": age,
            "Quality_Score": quality,
            "Total_Rooms": total_rooms,
            "Size_Per_Room": size_per_room,
            "Is_London": 1 if city == "London" else 0,
            "Is_Central_London": 0,
            "Is_Detached": 1 if prop_type == "Detached House" else 0,
            "Is_Townhouse": 1 if prop_type == "Townhouse" else 0,
        }])
        y = _model.predict(row)[0]
        return float(y), True
    except Exception:
        price = sqft * float(_df["Price_Per_Sqft"].mean())
        return price, False
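Finally, a small usage sketch of the predictor (my addition, not part of the commit); with no models/gradient_boosting_model.pkl present it exercises the price-per-sqft fallback path.

# Hypothetical usage of predictor.py.
from predictor import predict_price

price, used_model = predict_price(1200, 3, 2, 2005, 7, "Manchester", "Detached House")
print(f"£{price:,.0f}", "(trained model)" if used_model else "(fallback estimate)")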