Spaces:

Lesterchia1
/

HDB_Price_Predictor_R1

Sleeping

File size: 11,206 Bytes

# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import gradio as gr
import joblib
from sklearn.preprocessing import LabelEncoder

# Load the serialized XGBoost model from the working directory.
# `model` stays None whenever loading fails, which is the signal the prediction
# code checks before switching to the heuristic estimator.
# NOTE(review): joblib.load unpickles the file, so the artifact must come from
# a trusted source.
model = None
try:
    model = joblib.load('best_model_xgboost.joblib')
except FileNotFoundError:
    print("Warning: best_model_xgboost.joblib not found. Using fallback model.")
except Exception as load_error:
    print(f"Error loading model: {load_error}")
else:
    print("XGBoost model loaded successfully!")

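# Synthetic sample data (random values, not real transactions). It backs the
# market-insights text and supplies the category lists used for the dropdowns
# and label encoders, whether or not the trained model is available.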
def generate_sample_data():
    """Build a reproducible synthetic table of HDB resale transactions.

    The rows are random placeholders, not real transactions. A private
    ``RandomState`` seeded with 42 is used instead of ``np.random.seed(42)``:
    it yields the same sequence of draws, so the table is identical on every
    call, but the process-wide NumPy random state is left untouched.

    Returns:
        pandas.DataFrame: 1000 rows with the columns ``town``, ``flat_type``,
        ``flat_model``, ``floor_area_sqm``, ``storey_level``, ``flat_age``
        and ``resale_price``.
    """
    rng = np.random.RandomState(42)
    n_samples = 1000

    towns = ['ANG MO KIO', 'BEDOK', 'BISHAN', 'BUKIT BATOK', 'BUKIT MERAH', 'BUKIT PANJANG', 'BUKIT TIMAH', 'CENTRAL AREA', 'CHOA CHU KANG', 'CLEMENTI', 'GEYLANG', 'HOUGANG', 'JURONG EAST', 'JURONG WEST', 'KALLANG/WHAMPOA', 'MARINE PARADE', 'PASIR RIS', 'PUNGGOL', 'QUEENSTOWN', 'SEMBAWANG', 'SENGKANG', 'SERANGOON', 'TAMPINES', 'TOA PAYOH', 'WOODLANDS', 'YISHUN' ]
    flat_types = ['2 ROOM', '3 ROOM', '4 ROOM', '5 ROOM', 'EXECUTIVE', 'MULTI-GENERATION']
    flat_models = ['2 ROOM', '3Gen', 'Adjoined flat', 'Apartment', 'DBSS', 'Improved', 'Improved-Maisonette', 'Maisonette', 'Model A', 'Model A-Maisonette', 'Model-A2', 'MULTI-GENERATION', 'New Generation', 'Premium Apartment', 'Premium Apartment Loft', 'Simplified', 'Standard', 'Type S1', 'Type S2']

    # The draw order below is significant: it fixes which random numbers
    # each column receives for the given seed.
    data = {
        'town': rng.choice(towns, n_samples),
        'flat_type': rng.choice(flat_types, n_samples),
        'flat_model': rng.choice(flat_models, n_samples),
        'floor_area_sqm': rng.uniform(60, 150, n_samples),
        'storey_level': rng.randint(1, 25, n_samples),  # upper bound is exclusive
        'flat_age': rng.randint(0, 50, n_samples),  # upper bound is exclusive
        'resale_price': rng.uniform(200000, 800000, n_samples),
    }

    return pd.DataFrame(data)

# Build the synthetic data set used by the rest of the module.
data = generate_sample_data()

# Sorted vocabularies of the three categorical columns, taken from the
# values that actually occur in the sample data.
towns_list, flat_types, flat_models = (
    sorted(data[column].unique().tolist())
    for column in ('town', 'flat_type', 'flat_model')
)

# One label encoder per categorical column, fitted on the vocabularies above.
# NOTE(review): these are fitted on the sample data, not on the training set;
# the codes only match the trained model if training used the same category
# lists — confirm against the training pipeline.
town_encoder = LabelEncoder().fit(towns_list)
flat_type_encoder = LabelEncoder().fit(flat_types)
flat_model_encoder = LabelEncoder().fit(flat_models)

def simple_xgboost_emulation(input_data):
    """Estimate a price with a fixed hand-written formula (fallback path).

    The estimate is a base price plus weighted numeric features, plus a
    term per categorical feature proportional to its position in the sorted
    module-level category list, plus an area-times-storey interaction term.

    Args:
        input_data: Mapping with the keys ``town``, ``flat_type``,
            ``flat_model``, ``floor_area_sqm``, ``storey_level``,
            ``flat_age`` and ``remaining_lease``.

    Returns:
        The estimated price, never lower than 100000.

    Raises:
        ValueError: If a categorical value is not in its category list.
    """
    # Categorical contributions: list position times a per-step weight.
    town_term = towns_list.index(input_data['town']) * 9500
    type_term = flat_types.index(input_data['flat_type']) * 14500
    model_term = flat_models.index(input_data['flat_model']) * 8500

    area = input_data['floor_area_sqm']
    storey = input_data['storey_level']

    # Cross term between floor area and storey level.
    cross_term = (area * storey) / 100 * 500

    # Accumulate left to right, starting from the base price, so the
    # floating-point result does not depend on a different summation order.
    estimate = 220000
    for contribution in (
        area * 5200,
        storey * 1800,
        input_data['flat_age'] * -2800,
        input_data['remaining_lease'] * 1200,
        town_term,
        type_term,
        model_term,
        cross_term,
    ):
        estimate = estimate + contribution

    return max(estimate, 100000)

def preprocess_input(town, flat_type, flat_model, floor_area_sqm, storey_level, flat_age):
    """Coerce the raw form values and bundle them into one feature dict.

    The categorical values are passed through unchanged. The floor area is
    converted with ``float`` and the storey level and flat age with ``int``.
    ``remaining_lease`` is derived as ``99 - flat_age``.

    Returns:
        dict: Keys ``town``, ``flat_type``, ``flat_model``,
        ``floor_area_sqm``, ``storey_level``, ``flat_age`` and
        ``remaining_lease``, in that order.

    Raises:
        ValueError, TypeError: If a numeric argument cannot be converted.
    """
    area = float(floor_area_sqm)
    storey = int(storey_level)
    age_years = int(flat_age)

    return dict(
        town=town,
        flat_type=flat_type,
        flat_model=flat_model,
        floor_area_sqm=area,
        storey_level=storey,
        flat_age=age_years,
        remaining_lease=99 - age_years,
    )

def prepare_features_for_model(input_data):
    """Turn one preprocessed input dict into a single-row numeric DataFrame.

    The three categorical values are converted to integer codes by the
    module-level label encoders. The returned frame has the columns
    ``floor_area_sqm``, ``storey_level``, ``flat_age``, ``remaining_lease``,
    ``town_encoded``, ``flat_type_encoded`` and ``flat_model_encoded``, in
    that order.
    NOTE(review): whether this column order matches what the trained model
    expects cannot be verified from this file — confirm.

    Raises:
        KeyError: If ``input_data`` lacks one of the expected keys.
        ValueError: If a categorical value is unknown to its encoder.
    """
    # Read every field first so a missing key is reported before any
    # encoder is invoked.
    town = input_data['town']
    flat_type = input_data['flat_type']
    flat_model = input_data['flat_model']
    numeric_row = {
        'floor_area_sqm': input_data['floor_area_sqm'],
        'storey_level': input_data['storey_level'],
        'flat_age': input_data['flat_age'],
        'remaining_lease': input_data['remaining_lease'],
    }

    # Each encoder takes a one-element list and returns a one-element array.
    town_code = town_encoder.transform([town])[0]
    flat_type_code = flat_type_encoder.transform([flat_type])[0]
    flat_model_code = flat_model_encoder.transform([flat_model])[0]

    # Numeric columns first, then the encoded categoricals appended.
    return pd.DataFrame([numeric_row]).assign(
        town_encoded=town_code,
        flat_type_encoded=flat_type_code,
        flat_model_encoded=flat_model_code,
    )

def predict_with_xgboost(input_data):
    """Predict a price with the loaded model, falling back to the heuristic.

    When no model was loaded, or when feature preparation or prediction
    raises, the result of ``simple_xgboost_emulation`` is returned instead
    (the error is printed, not re-raised).

    Args:
        input_data: Feature dict as produced by ``preprocess_input``.

    Returns:
        The predicted price, never lower than 100000.
    """
    if model is None:
        return simple_xgboost_emulation(input_data)

    try:
        model_inputs = prepare_features_for_model(input_data)
        # The model is given a single row, so only the first output is used.
        first_prediction = model.predict(model_inputs)[0]
        floored_price = max(first_prediction, 100000)
    except Exception as prediction_error:
        # Best-effort behaviour: report the problem and use the heuristic.
        print(f"Prediction error: {prediction_error}")
        return simple_xgboost_emulation(input_data)
    else:
        return floored_price

def create_market_insights_chart(data, town, flat_type, predicted_price):
    """Build a Markdown summary comparing a prediction with matching rows.

    Despite the name, no chart is produced; the result is text.

    Args:
        data: DataFrame with ``town``, ``flat_type`` and ``resale_price``
            columns.
        town: Town to filter on.
        flat_type: Flat type to filter on.
        predicted_price: Price to compare against the filtered average.

    Returns:
        str: A Markdown block with the row count, average, minimum and
        maximum price and the percentage difference of the prediction from
        the average, or a fixed message when no rows match.
    """
    same_town = data['town'] == town
    same_flat_type = data['flat_type'] == flat_type
    matches = data[same_town & same_flat_type]

    if matches.empty:
        return "No historical data available for this town and flat type combination."

    prices = matches['resale_price']
    mean_price = prices.mean()
    lowest_price = prices.min()
    highest_price = prices.max()
    n_matches = len(matches)

    # Relative gap between the prediction and the average; reported as 0
    # when the average is not positive, which avoids dividing by zero.
    gap = predicted_price - mean_price
    if mean_price > 0:
        gap_percent = (gap / mean_price) * 100
    else:
        gap_percent = 0

    return f"""
    ## Market Insights for {town} - {flat_type}
    
    - Historical transactions: {n_matches}
    - Average price: ${mean_price:,.2f}
    - Price range: ${lowest_price:,.2f} - ${highest_price:,.2f}
    
    ### Prediction Analysis:
    - Predicted Price: ${predicted_price:,.2f}
    - Difference from average: {gap_percent:+.1f}%
    
    *Note: Market insights are based on simulated data. Prediction uses {'XGBoost model' if model else 'fallback model'}.*
    """

def predict_hdb_price(town, flat_type, flat_model, floor_area_sqm, storey_level, flat_age):
    """Validate the form inputs, predict a resale price and build the outputs.

    Args:
        town: Selected town.
        flat_type: Selected flat type.
        flat_model: Selected flat model.
        floor_area_sqm: Floor area; must convert to a finite float > 0.
        storey_level: Storey level; must convert to an int > 0.
        flat_age: Flat age in years; must convert to an int >= 0.

    Returns:
        tuple[str, str, str]: ``(price_text, insights_markdown,
        summary_markdown)``. On invalid input the first element is an error
        message and the other two are ``"Invalid input"``; no exception is
        raised for bad numeric input.
    """
    # Validate inputs. A missing value (None, e.g. an emptied number field)
    # raises TypeError and an infinite value raises OverflowError in int(),
    # so both are handled together with ValueError.
    try:
        floor_area_sqm = float(floor_area_sqm)
        storey_level = int(storey_level)
        flat_age = int(flat_age)
    except (TypeError, ValueError, OverflowError):
        return "Please enter valid numbers for floor area, storey level, and flat age.", "Invalid input", "Invalid input"

    # The chained comparison is False for NaN and for infinity as well as
    # for non-positive areas, so all of them are rejected here.
    area_is_valid = 0 < floor_area_sqm < float("inf")
    if not area_is_valid or storey_level <= 0 or flat_age < 0:
        return "Invalid input: Please enter positive values.", "Invalid input", "Invalid input"

    # Preprocess the user input
    input_data = preprocess_input(town, flat_type, flat_model, floor_area_sqm, storey_level, flat_age)

    # Make prediction (trained model when available, heuristic otherwise)
    predicted_price = predict_with_xgboost(input_data)

    # Generate insights from the module-level sample data
    insights = create_market_insights_chart(
        data=data,
        town=town,
        flat_type=flat_type,
        predicted_price=predicted_price
    )

    # Reflects whether a model was loaded, not whether this particular
    # prediction fell back to the heuristic after an error.
    model_source = "XGBoost model" if model else "fallback model"

    summary = f"""
    ### Property Details 🏡
    - **Town:** {town}
    - **Flat Type:** {flat_type}
    - **Flat Model:** {flat_model}
    - **Floor Area:** {floor_area_sqm} sqm
    - **Storey Level:** {storey_level}
    - **Flat Age:** {flat_age} years

    ---

    ### Prediction Summary
    The predicted price is **${predicted_price:,.2f}**.
    
    *Prediction made using {model_source}. Market insights based on simulated data.*
    """

    return f"${predicted_price:,.2f}", insights, summary

# Create the Gradio interface.
# Components are laid out in the order they are created inside the nested
# context managers, so statement order in this section determines the page.
with gr.Blocks(title="HDB Resale Price Predictor", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🏘️ HDB Resale Price Predictor")
    gr.Markdown("Estimate the resale price of HDB flats in Singapore based on property features.")
    
    # Display model status (decided once, when the interface is built)
    if model:
        gr.Markdown("✅ **XGBoost model loaded successfully!**")
    else:
        gr.Markdown("⚠️ **Using fallback model - XGBoost model not found**")
    
    with gr.Row():
        # Left column: the six inputs and the button
        with gr.Column():
            # Dropdown choices come from the module-level category lists
            town = gr.Dropdown(choices=towns_list, label="Town", value="ANG MO KIO")
            flat_type = gr.Dropdown(choices=flat_types, label="Flat Type", value="4 ROOM")
            flat_model = gr.Dropdown(choices=flat_models, label="Flat Model", value="Improved")
            floor_area_sqm = gr.Number(label="Floor Area (sqm)", value=100, minimum=1, maximum=500)
            storey_level = gr.Slider(minimum=1, maximum=50, step=1, label="Storey Level", value=5)
            flat_age = gr.Slider(minimum=0, maximum=99, step=1, label="Flat Age (years)", value=10)
            predict_btn = gr.Button("Predict Price", variant="primary")
        
        # Right column: the three outputs returned by predict_hdb_price
        with gr.Column():
            price_output = gr.Label(label="Predicted Resale Price")
            insights_output = gr.Markdown()
            summary_output = gr.Markdown()

    
    # The inputs are listed in the same order as predict_hdb_price's
    # parameters, and the outputs in the order of its returned 3-tuple.
    predict_btn.click(
        fn=predict_hdb_price,
        inputs=[town, flat_type, flat_model, floor_area_sqm, storey_level, flat_age],
        outputs=[price_output, insights_output, summary_output]
    )

    # Disabled plot output, kept for reference:
    #with gr.Row():
    #    chart_output = gr.Plot(label="📈 Market Insights")

    
    
    # Preset example rows; each row supplies one value per input, in the
    # same order as the `inputs` list.
    gr.Examples(
        examples=[
            ["ANG MO KIO", "4 ROOM", "Improved", 100, 5, 10],
            ["BEDOK", "3 ROOM", "New Generation", 80, 8, 5],
            ["TAMPINES", "5 ROOM", "Model A", 120, 12, 15]
        ],
        inputs=[town, flat_type, flat_model, floor_area_sqm, storey_level, flat_age]
    )

# Launch the application only when run as a script, listening on all
# network interfaces ("0.0.0.0") on port 7860.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)