Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import numpy as np | |
| from sklearn.model_selection import train_test_split, GridSearchCV | |
| from sklearn.ensemble import RandomForestRegressor | |
| from sklearn.metrics import mean_squared_error, r2_score | |
| import gradio as gr | |
# URL of the Excel dataset hosted on Hugging Face
data_url = "https://huggingface.co/datasets/leadingbridge/flat/resolve/main/NorthPoint30.xlsx"

# Load the dataset (openpyxl is the engine used for reading the .xlsx file)
df = pd.read_excel(data_url, engine="openpyxl")

# Drop columns that are not needed for prediction; errors='ignore' tolerates
# any of these columns being absent from the sheet.
cols_to_drop = ['Usage', 'Address', 'PricePerSquareFeet', 'InstrumentDate', 'Floor', 'Unit']
df.drop(columns=cols_to_drop, inplace=True, errors='ignore')

# The original 'Floor' and 'Unit' columns were dropped above, so the '.1'
# variants can take over the plain names.
# NOTE(review): 'Floor.1'/'Unit.1' are presumably pandas' de-duplicated names
# for a second Floor/Unit column in the sheet - confirm against the spreadsheet.
df.rename(columns={"Floor.1": "Floor", "Unit.1": "Unit"}, inplace=True)

# Fail fast, and say exactly which columns are absent, before any training starts
required_columns = ['District', 'PriceInMillion', 'Longitude', 'Latitude', 'Floor', 'Unit', 'Area', 'Year', 'WeekNumber']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise ValueError(
        "Dataset is missing one or more required columns: "
        + ", ".join(missing_columns)
    )
# Feature columns fed to the model, and the target column it learns to predict
feature_names = [
    'District', 'Longitude', 'Latitude', 'Floor',
    'Unit', 'Area', 'Year', 'WeekNumber',
]
y = df['PriceInMillion']
X = df[feature_names]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Hyper-parameter candidates explored by the grid search; the single
# random_state entry pins the seed of every candidate forest.
rf_param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[4, 6, 8],
    max_features=['sqrt', 'log2', 3],
    random_state=[42],
)

# 5-fold cross-validated search over the grid. error_score='raise' makes a
# failing fit raise instead of being scored, and refit=True retrains the best
# parameter combination on the whole training split.
rf_grid = GridSearchCV(
    estimator=RandomForestRegressor(),
    param_grid=rf_param_grid,
    cv=5,
    refit=True,
    verbose=1,
    error_score='raise',
)
rf_grid.fit(X_train, y_train)

# The refitted best estimator is the model used for all later predictions
model = rf_grid.best_estimator_
# Score the selected model on the held-out test set and report RMSE and R2
rf_pred = model.predict(X_test)
rf_r2 = r2_score(y_test, rf_pred)
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred))
print("Random Forest RMSE: ", rf_rmse)
print("Random Forest R2: ", rf_r2)
def price_prediction(model, feature_names, new_data):
    """Return the model's prediction for a single observation.

    Args:
        model: Object exposing ``predict(DataFrame)`` that returns an
            indexable sequence of predictions.
        feature_names: Column labels, in the same order as ``new_data``.
        new_data: Feature values for the one observation to score.

    Returns:
        The first (and only) element of the model's prediction output.
    """
    # Wrap the observation in a one-row frame so the columns carry the
    # feature names passed in by the caller.
    single_row = pd.DataFrame([new_data], columns=feature_names)
    return model.predict(single_row)[0]
def predict(district, longitude, latitude, floor, unit, area, year, weeknumber):
    """Gradio callback: predict a property price from the form inputs.

    Args:
        district, longitude, latitude, floor, unit, area, year, weeknumber:
            Raw values from the interface widgets, in the same order as the
            module-level ``feature_names`` list.

    Returns:
        The predicted price formatted like ``"$12.35 Million"``, or a short
        message naming the fields that still need a value when any input
        is ``None``.
    """
    inputs = {
        "District": district,
        "Longitude": longitude,
        "Latitude": latitude,
        "Floor": floor,
        "Unit": unit,
        "Area": area,
        "Year": year,
        "Week Number": weeknumber,
    }

    # Widgets created without a default value can reach this callback as None
    # (e.g. an unselected dropdown). Passing None on to the model gives either
    # an opaque error or a meaningless prediction, so ask for the value instead.
    missing = [label for label, value in inputs.items() if value is None]
    if missing:
        return "Please provide a value for: " + ", ".join(missing)

    new_data = list(inputs.values())
    prediction = price_prediction(model, feature_names, new_data)
    return f"${prediction:,.2f} Million"
# Gradio Interface
# One input widget per argument of predict(), listed in the same order as
# that function's parameters.
input_components = [
    gr.Dropdown(
        choices=[*range(1, 9)],
        label='District (1 = Taikoo Shing, 2 = Mei Foo Sun Chuen, 3 = South Horizons, 4 = Whampoa Garden)',
    ),
    gr.Number(label='Longitude'),
    gr.Number(label='Latitude'),
    gr.Dropdown(choices=[*range(1, 71)], label='Floor'),
    gr.Dropdown(choices=[*range(1, 31)], label='Unit (e.g., A=1, B=2, C=3, ...)'),
    gr.Slider(minimum=137, maximum=5000, step=1, label='Area (in sq. feet)'),
    gr.Dropdown(choices=[2024, 2025], label='Year'),
    gr.Dropdown(choices=[*range(1, 53)], label='Week Number'),
]

# The formatted prediction string is shown in a single text box
iface = gr.Interface(
    fn=predict,
    inputs=input_components,
    outputs=gr.Textbox(label='Price Prediction'),
    title="PROPERTY PRICE PREDICTION TOOL (Larry Pang Final Year Project)",
    description="Predict the price of a new property based on District, Longitude, Latitude, Floor, Unit, Area, Year, and Week Number.",
)

# Start the web app only when this file is run as a script
if __name__ == "__main__":
    iface.launch()