"""Train a random-forest property price model and serve it through Gradio.

At import time this script downloads the Excel dataset, tunes a
``RandomForestRegressor`` with a 5-fold grid search, prints RMSE and R2 on a
held-out 20% test split, and builds the Gradio interface ``iface``. The web
UI is only launched when the file is executed as a script.
"""
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import gradio as gr

# URL to the Excel dataset on Hugging Face
data_url = "https://huggingface.co/datasets/leadingbridge/flat/resolve/main/NorthPoint30.xlsx"

# Load dataset (using openpyxl as the engine)
df = pd.read_excel(data_url, engine="openpyxl")

# Drop columns that are not needed for prediction. 'Floor' and 'Unit' are
# dropped here because the 'Floor.1' / 'Unit.1' columns take over their names
# below (presumably the numeric encodings -- confirm against the spreadsheet).
# errors='ignore' keeps this working if some of these columns are absent.
cols_to_drop = ['Usage', 'Address', 'PricePerSquareFeet', 'InstrumentDate', 'Floor', 'Unit']
df.drop(columns=cols_to_drop, inplace=True, errors='ignore')

# Rename useful columns for consistency
df.rename(columns={"Floor.1": "Floor", "Unit.1": "Unit"}, inplace=True)

# Ensure the dataset has the necessary columns, and say which ones are
# missing so a schema change in the spreadsheet is easy to diagnose.
required_columns = ['District', 'PriceInMillion', 'Longitude', 'Latitude', 'Floor', 'Unit', 'Area', 'Year', 'WeekNumber']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise ValueError(
        f"Dataset is missing one or more required columns: {missing_columns}"
    )

# Define features and target variable
feature_names = ['District', 'Longitude', 'Latitude', 'Floor', 'Unit', 'Area', 'Year', 'WeekNumber']
X = df[feature_names]
y = df['PriceInMillion']

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define a parameter grid for RandomForestRegressor and perform grid search.
# random_state is part of the grid (single value) so every candidate forest
# is seeded identically and the search is reproducible.
rf_param_grid = {
    'n_estimators': [50, 100, 150],
    "max_depth": [4, 6, 8],
    "max_features": ['sqrt', 'log2', 3],
    "random_state": [42],
}
# error_score='raise' makes a failing fit abort the search instead of being
# recorded as a NaN score.
rf_grid = GridSearchCV(
    RandomForestRegressor(),
    rf_param_grid,
    refit=True,
    verbose=1,
    cv=5,
    error_score='raise',
)
rf_grid.fit(X_train, y_train)

# Use the best estimator (already refit on the full training split)
model = rf_grid.best_estimator_

# Print model performance on the test set
rf_pred = model.predict(X_test)
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred))
rf_r2 = r2_score(y_test, rf_pred)
print("Random Forest RMSE: ", rf_rmse)
print("Random Forest R2: ", rf_r2)


def price_prediction(model, feature_names, new_data):
    """Return the model's prediction for a single observation.

    Args:
        model: Fitted estimator exposing ``predict``.
        feature_names: Column names, in the order the model was trained on.
        new_data: One value per entry of ``feature_names``, in the same order.

    Returns:
        The first (and only) element of ``model.predict`` for that row.
    """
    # Build a one-row frame with named columns so the input matches the
    # DataFrame layout used for training.
    new_data_df = pd.DataFrame([new_data], columns=feature_names)
    prediction = model.predict(new_data_df)
    return prediction[0]


def predict(district, longitude, latitude, floor, unit, area, year, weeknumber):
    """Gradio callback: format the predicted price for the given inputs.

    Args:
        district, longitude, latitude, floor, unit, area, year, weeknumber:
            Values from the interface components, in ``feature_names`` order.

    Returns:
        The prediction formatted as ``"$<value> Million"`` with two decimals.

    Raises:
        gr.Error: If any input was left unset (``None``), e.g. a dropdown
            with no selection or an empty number box.
    """
    new_data = [district, longitude, latitude, floor, unit, area, year, weeknumber]

    # Unset components arrive as None; stop here with a readable message
    # rather than passing a missing value on to the model.
    unset = [name for name, value in zip(feature_names, new_data) if value is None]
    if unset:
        raise gr.Error("Please provide a value for: " + ", ".join(unset))

    prediction = price_prediction(model, feature_names, new_data)
    return f"${prediction:,.2f} Million"


# Gradio Interface
# NOTE(review): the District dropdown offers 1-8 but its label only names
# districts 1-4 -- confirm the remaining codes against the dataset.
iface = gr.Interface(
    fn=predict,
    inputs=[
        gr.Dropdown(choices=list(range(1, 9)), label='District (1 = Taikoo Shing, 2 = Mei Foo Sun Chuen, 3 = South Horizons, 4 = Whampoa Garden)'),
        gr.Number(label='Longitude'),
        gr.Number(label='Latitude'),
        gr.Dropdown(choices=list(range(1, 71)), label='Floor'),
        gr.Dropdown(choices=list(range(1, 31)), label='Unit (e.g., A=1, B=2, C=3, ...)'),
        gr.Slider(minimum=137, maximum=5000, step=1, label='Area (in sq. feet)'),
        gr.Dropdown(choices=[2024, 2025], label='Year'),
        gr.Dropdown(choices=list(range(1, 53)), label='Week Number'),
    ],
    outputs=gr.Textbox(label='Price Prediction'),
    title="PROPERTY PRICE PREDICTION TOOL (Larry Pang Final Year Project)",
    description="Predict the price of a new property based on District, Longitude, Latitude, Floor, Unit, Area, Year, and Week Number.",
)

if __name__ == "__main__":
    iface.launch()