# Source of a Hugging Face Space app (Space status: Running; file size: 8,716 bytes).
import gradio as gr
import pandas as pd
import plotly.express as px
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping
# Markdown help text shown to the user; it is passed as the `description`
# of the Gradio interface built at the bottom of this file.
lstm_explanation = """
## Understanding LSTM in This App
**What is LSTM?**
LSTM (Long Short-Term Memory) is a type of neural network designed for time-series data, like housing prices. It excels at capturing patterns in sequential data, making it ideal for predicting future values based on historical trends.
**How is it used here?**
- The LSTM model uses housing price data since January 2000 for the selected ZIP code.
- It takes a 60-month lookback window (5 years) of historical prices to predict the next month's price.
- The model learns trends, such as seasonal changes or long-term growth.
- 'LSTM Fit on Training Data' shows how well the model learned the patterns in the historical data it was trained on.
- 'LSTM Predictions on Hold-out Data' shows the model's predictions for a recent period of actual prices that it wasn't trained on, to evaluate its forecasting ability.
- 'LSTM Future Predictions' shows the model's predictions for months *beyond* the available historical data.
"""
def _sliding_windows(series, lookback):
    """Split a 1-D array into LSTM inputs and their next-step targets.

    Returns a tuple ``(inputs, targets)`` where ``inputs`` has shape
    ``(len(series) - lookback, lookback, 1)`` and ``targets[k]`` is the value
    that immediately follows ``inputs[k]`` in ``series``.
    """
    windows = np.array([series[end - lookback:end] for end in range(lookback, len(series))])
    # reshape(-1, ...) also yields a well-formed (0, lookback, 1) array when
    # the series is too short to produce any window.
    return windows.reshape(-1, lookback, 1), series[lookback:]


def plot_real_estate(zip_code_str, future_months_to_predict_on_holdout=12, future_months_to_project=6):
    """Plot Zillow home values for a ZIP code together with LSTM forecasts.

    The function downloads the Zillow ZHVI CSV, extracts the monthly price
    series for the requested ZIP code (columns from '2000-01-31' onward),
    trains a small LSTM on everything except the last
    ``future_months_to_predict_on_holdout`` months, and returns a Plotly
    figure containing:

    * the actual prices and their 3/6/12/24-month moving averages,
    * the model's fit on the training portion,
    * its predictions for the held-out months (skipped when that count is 0),
    * an iterative projection ``future_months_to_project`` months past the
      last available date (skipped when that count is 0).

    Args:
        zip_code_str: ZIP code as text (anything ``int()`` accepts).
        future_months_to_predict_on_holdout: Number of most recent months to
            withhold from training and predict for evaluation. Floats (as a
            UI slider may deliver) are truncated to int.
        future_months_to_project: Number of months to forecast beyond the
            data. Floats are truncated to int.

    Returns:
        A Plotly figure. Every recoverable problem (bad ZIP, bad month
        counts, download failure, unknown ZIP, too little history) is
        reported as an empty figure whose title carries the message.
    """
    lookback = 60  # months of history fed to the model per prediction
    ma_windows = (3, 6, 12, 24)

    # --- Validate inputs ---
    try:
        zip_val = int(zip_code_str)
    except (TypeError, ValueError):  # TypeError covers None
        return px.line(title=f"Invalid ZIP Code: '{zip_code_str}'. Please enter a numeric ZIP code.")

    try:
        # range() and slicing below need real ints, not 12.0.
        holdout_months = int(future_months_to_predict_on_holdout)
        horizon_months = int(future_months_to_project)
    except (TypeError, ValueError):
        return px.line(title='Hold-out and future month counts must be whole numbers.')
    if holdout_months < 0 or horizon_months < 0:
        return px.line(title='Hold-out and future month counts must not be negative.')

    # --- Load and reshape the price series ---
    # NOTE(review): the full CSV is downloaded on every call; consider caching.
    try:
        df_full = pd.read_csv('https://files.zillowstatic.com/research/public_csvs/zhvi/Zip_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv')
    except (OSError, ValueError) as exc:  # network/HTTP errors and parse errors
        return px.line(title=f'Could not load Zillow housing data: {exc}')

    df_zip_subset = df_full[df_full['RegionName'] == zip_val]
    if df_zip_subset.empty:
        return px.line(title=f'No data found for Zip Code {zip_val}')

    # Date columns become rows: one (Date, Price) pair per month.
    df_processed = df_zip_subset.loc[:, '2000-01-31':].T.reset_index()
    df_processed.columns = ['Date', 'Price']
    df_processed['Date'] = pd.to_datetime(df_processed['Date'])
    df_processed = df_processed.dropna(subset=['Price'])

    required_months = lookback + holdout_months
    if len(df_processed) < required_months:
        return px.line(title=f'Not enough historical data for Zip Code {zip_val} (need at least {required_months} months of data).')

    for window in ma_windows:
        df_processed[f'{window}-Month MA'] = df_processed['Price'].rolling(window).mean()

    # --- Prepare data for the LSTM ---
    prices = df_processed['Price'].values.reshape(-1, 1)
    train_size = len(prices) - holdout_months  # >= lookback thanks to the check above

    # Fit the scaler on the training portion only so the hold-out months do
    # not leak into the scaling, then apply it to the whole series.
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler.fit(prices[:train_size])
    scaled_full = scaler.transform(prices)[:, 0]

    x_train, y_train = _sliding_windows(scaled_full[:train_size], lookback)
    if len(y_train) == 0:  # exactly `lookback` training months -> no target left
        return px.line(title=f'Not enough data to form training sequences for Zip Code {zip_val}.')

    # --- Build and train the model ---
    model = Sequential()
    model.add(LSTM(units=50, return_sequences=True, input_shape=(lookback, 1)))
    model.add(LSTM(units=50, return_sequences=False))
    model.add(Dense(units=25))
    model.add(Dense(units=1))
    model.compile(optimizer='adam', loss='mean_squared_error')
    # A single epoch keeps the app responsive; raise `epochs` (e.g. 50) for a
    # better fit -- only then does the EarlyStopping callback have any effect.
    model.fit(x_train, y_train, batch_size=1, epochs=1,
              callbacks=[EarlyStopping(monitor='loss', patience=2)], verbose=0)

    # --- Predictions (verbose=0 everywhere to avoid per-call progress bars) ---
    # 1. Fit on the training portion; targets start `lookback` months in.
    past_predictions = scaler.inverse_transform(model.predict(x_train, verbose=0)).flatten()
    past_pred_dates = df_processed['Date'].iloc[lookback:train_size].reset_index(drop=True)

    # 2. Hold-out months: each window is made of actual (scaled) prices.
    holdout_predictions = np.array([])
    # Positive slice start: `iloc[-0:]` would select every row when holdout is 0.
    holdout_pred_dates = df_processed['Date'].iloc[train_size:].reset_index(drop=True)
    if holdout_months:
        x_holdout, _ = _sliding_windows(scaled_full[train_size - lookback:], lookback)
        holdout_predictions = scaler.inverse_transform(model.predict(x_holdout, verbose=0)).flatten()

    # 3. Projection past the data: feed each prediction back into the window.
    future_scaled = []
    rolling_window = scaled_full[-lookback:].reshape(1, lookback, 1)
    for _ in range(horizon_months):
        next_scaled = model.predict(rolling_window, verbose=0)
        future_scaled.append(next_scaled[0, 0])
        rolling_window = np.concatenate([rolling_window[:, 1:, :], next_scaled.reshape(1, 1, 1)], axis=1)

    future_predictions = np.array([])
    if future_scaled:  # inverse_transform rejects an empty array
        future_predictions = scaler.inverse_transform(np.array(future_scaled).reshape(-1, 1)).flatten()

    # Month-end stamps strictly after the last observation. Using the offset
    # object avoids the deprecated 'M' alias and does not depend on the last
    # date itself being a month end.
    month_end = pd.offsets.MonthEnd(1)
    last_actual_date = df_processed['Date'].iloc[-1]
    future_dates = pd.date_range(start=last_actual_date + month_end, periods=horizon_months, freq=month_end)

    # --- Plotting ---
    fig = px.line(df_processed, x='Date', y='Price', title=f'Housing Prices & LSTM Analysis for Zip Code {zip_val}')
    fig.data[0].showlegend = True
    fig.data[0].name = 'Actual Price'
    for window in ma_windows:
        fig.add_scatter(x=df_processed['Date'], y=df_processed[f'{window}-Month MA'], mode='lines', name=f'{window}-Month MA')

    if len(past_pred_dates) == len(past_predictions):
        fig.add_scatter(x=past_pred_dates, y=past_predictions, mode='lines', line=dict(dash='dash'), name='LSTM Fit on Training Data')
    if holdout_months and len(holdout_pred_dates) == len(holdout_predictions):
        fig.add_scatter(x=holdout_pred_dates, y=holdout_predictions, mode='lines', line=dict(color='red'), name='LSTM Predictions on Hold-out Data')
    if horizon_months:
        fig.add_scatter(x=future_dates, y=future_predictions, mode='lines', line=dict(color='green'), name='LSTM Future Predictions')

    fig.update_layout(legend_title_text='Legend')
    return fig
# --- Gradio Interface ---
# Input widgets, created in the positional order of plot_real_estate's
# parameters: ZIP code, hold-out months, months to project.
_zip_code_box = gr.Textbox(label="Enter ZIP Code (e.g., 90210)")
_holdout_slider = gr.Slider(
    label="Months for Hold-out Prediction",
    minimum=6,
    maximum=36,
    value=12,
    step=1,
)
_projection_slider = gr.Slider(
    label="Months to Predict into the Future",
    minimum=3,
    maximum=24,
    value=6,
    step=1,
)

# The app: three inputs in, one Plotly figure out, flagging disabled.
iface = gr.Interface(
    fn=plot_real_estate,
    inputs=[_zip_code_box, _holdout_slider, _projection_slider],
    outputs=gr.Plot(),
    title="Real Estate Price Analysis with LSTM Prediction",
    description=lstm_explanation,
    allow_flagging='never',
)
# Start the Gradio server only when this file is executed as a script,
# not when it is imported. (A stray trailing `|` after the launch call was
# removed; it made the line a syntax error.)
if __name__ == '__main__':
    iface.launch(share=False, debug=True)