# dibend's picture
# Update app.py
# dd5d3b4 verified
from functools import lru_cache

import gradio as gr
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.models import Sequential
# Markdown rendered in the Gradio UI description panel; explains to the end
# user how the LSTM is trained and what each plotted series represents.
lstm_explanation = """
## Understanding LSTM in This App
**What is LSTM?**
LSTM (Long Short-Term Memory) is a type of neural network designed for time-series data, like housing prices. It excels at capturing patterns in sequential data, making it ideal for predicting future values based on historical trends.
**How is it used here?**
- The LSTM model uses housing price data since January 2000 for the selected ZIP code.
- It takes a 60-month lookback window (5 years) of historical prices to predict the next month's price.
- The model learns trends, such as seasonal changes or long-term growth.
- 'LSTM Fit on Training Data' shows how well the model learned the patterns in the historical data it was trained on.
- 'LSTM Predictions on Hold-out Data' shows the model's predictions for a recent period of actual prices that it wasn't trained on, to evaluate its forecasting ability.
- 'LSTM Future Predictions' shows the model's predictions for months *beyond* the available historical data.
"""
_ZHVI_URL = (
    'https://files.zillowstatic.com/research/public_csvs/zhvi/'
    'Zip_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv'
)


@lru_cache(maxsize=1)
def _load_zhvi_data():
    """Download the Zillow ZHVI CSV once per process and return the DataFrame.

    The file is several megabytes; the original code re-downloaded it on every
    Gradio request, which dominated response time. Callers must not mutate the
    cached frame (plot_real_estate only filters/copies it).
    """
    return pd.read_csv(_ZHVI_URL)


def _build_lstm_model(lookback):
    """Build and compile the stacked-LSTM regressor used for price forecasting."""
    model = Sequential()
    model.add(LSTM(units=50, return_sequences=True, input_shape=(lookback, 1)))
    model.add(LSTM(units=50, return_sequences=False))
    model.add(Dense(units=25))
    model.add(Dense(units=1))
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model


def plot_real_estate(zip_code_str, future_months_to_predict_on_holdout=12,
                     future_months_to_project=6, lookback_months=60):
    """Plot historical ZHVI prices for one ZIP code with LSTM fit and forecasts.

    Parameters
    ----------
    zip_code_str : str
        ZIP code entered by the user; must parse as an integer.
    future_months_to_predict_on_holdout : int, default 12
        Trailing months excluded from training and predicted one step at a
        time for out-of-sample evaluation.
    future_months_to_project : int, default 6
        Months to forecast autoregressively beyond the last data point.
    lookback_months : int, default 60
        Length of the sliding window (in months) fed to the LSTM.

    Returns
    -------
    plotly Figure — either the full analysis plot, or an empty plot whose
    title explains the problem (invalid ZIP, no data, insufficient history).
    """
    try:
        zip_val = int(zip_code_str)
    except (TypeError, ValueError):  # TypeError covers a None/non-string input
        return px.line(title=f"Invalid ZIP Code: '{zip_code_str}'. Please enter a numeric ZIP code.")
    # Gradio sliders may deliver floats; the slicing below needs real ints.
    future_months_to_predict_on_holdout = int(future_months_to_predict_on_holdout)
    future_months_to_project = int(future_months_to_project)
    lookback_months = int(lookback_months)

    df_full = _load_zhvi_data()
    # Keep only the first matching row: RegionName should be unique, but a
    # duplicate would otherwise break the two-column assignment below.
    df_zip_subset = df_full[df_full['RegionName'] == zip_val].head(1)
    if df_zip_subset.empty:
        return px.line(title=f'No data found for Zip Code {zip_val}')

    # Reshape the wide monthly columns (Jan 2000 onward) into a Date/Price frame.
    df_processed = df_zip_subset.loc[:, '2000-01-31':]
    df_processed = df_processed.T.reset_index()
    df_processed.columns = ['Date', 'Price']
    df_processed['Date'] = pd.to_datetime(df_processed['Date'])
    df_processed.dropna(subset=['Price'], inplace=True)  # drop months with no reading

    if len(df_processed['Price']) < lookback_months + future_months_to_predict_on_holdout:
        return px.line(title=f'Not enough historical data for Zip Code {zip_val} '
                             f'(need at least {lookback_months + future_months_to_predict_on_holdout} months of data).')

    # Moving averages plotted alongside the raw series.
    for window in [3, 6, 12, 24]:
        df_processed[f'{window}-Month MA'] = df_processed['Price'].rolling(window).mean()

    # --- Prepare data for LSTM ---
    prices = df_processed['Price'].values.reshape(-1, 1)
    # Everything before the hold-out tail is available for scaler/model fitting.
    train_scaler_fit_size = len(prices) - future_months_to_predict_on_holdout
    if train_scaler_fit_size < lookback_months:
        return px.line(title=f'Not enough data before hold-out period for Zip Code {zip_val} '
                             f'(need at least {lookback_months} months for LSTM lookback).')
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler.fit(prices[:train_scaler_fit_size])  # fit ONLY on training data to avoid leakage
    scaled_data_full = scaler.transform(prices)

    # Sliding-window training sequences drawn from the training portion.
    train_sequences_source_data = scaled_data_full[:train_scaler_fit_size]
    x_train, y_train = [], []
    for i in range(lookback_months, len(train_sequences_source_data)):
        x_train.append(train_sequences_source_data[i - lookback_months:i, 0])
        y_train.append(train_sequences_source_data[i, 0])
    if not x_train:  # safeguard; the earlier length checks should prevent this
        return px.line(title=f'Not enough data to form training sequences for Zip Code {zip_val}.')
    x_train, y_train = np.array(x_train), np.array(y_train)
    x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))

    model = _build_lstm_model(x_train.shape[1])
    # NOTE(review): a single epoch with batch_size=1 barely trains the model;
    # raise epochs (e.g. 50) for usable forecasts — EarlyStopping already
    # guards longer runs. verbose=0 keeps the Gradio log quiet.
    model.fit(x_train, y_train, batch_size=1, epochs=1,
              callbacks=[EarlyStopping(monitor='loss', patience=2)], verbose=0)

    # --- Predictions ---
    # 1. In-sample fit over the training windows (visualizes what was learned).
    past_predictions_actual_scale = scaler.inverse_transform(model.predict(x_train))
    # Targets start lookback_months into the training slice.
    past_pred_dates = df_processed['Date'].iloc[lookback_months:len(train_sequences_source_data)].reset_index(drop=True)

    # 2. One-step-ahead predictions over the hold-out tail; each window uses
    #    the actual (not predicted) preceding prices.
    x_test_holdout = []
    for i in range(future_months_to_predict_on_holdout):
        seq_start_idx = train_scaler_fit_size - lookback_months + i  # window start in `prices`
        seq_end_idx = train_scaler_fit_size + i                      # window end in `prices`
        x_test_holdout.append(scaled_data_full[seq_start_idx:seq_end_idx, 0])
    x_test_holdout = np.reshape(np.array(x_test_holdout),
                                (future_months_to_predict_on_holdout, lookback_months, 1))
    holdout_predictions_actual_scale = scaler.inverse_transform(model.predict(x_test_holdout))
    # Hold-out predictions target the last N observed months.
    holdout_pred_dates = df_processed['Date'].iloc[-future_months_to_predict_on_holdout:].reset_index(drop=True)

    # 3. Autoregressive forecast past the end of the data: each prediction is
    #    fed back into the window for the next step.
    future_predictions_scaled = []
    last_sequence = scaled_data_full[-lookback_months:].reshape(1, lookback_months, 1)
    for _ in range(future_months_to_project):
        predicted_scaled = model.predict(last_sequence)
        future_predictions_scaled.append(predicted_scaled[0, 0])
        last_sequence = np.concatenate([last_sequence[:, 1:, :], predicted_scaled.reshape(1, 1, 1)], axis=1)
    future_predictions_actual_scale = scaler.inverse_transform(np.array(future_predictions_scaled).reshape(-1, 1))
    last_actual_date = df_processed['Date'].iloc[-1]
    # MonthEnd offset object instead of the 'M' alias, which is deprecated in
    # pandas >= 2.2 (renamed 'ME'); the offset works on all pandas versions.
    future_dates = pd.date_range(start=last_actual_date,
                                 periods=future_months_to_project + 1,
                                 freq=pd.offsets.MonthEnd())[1:]
    future_predictions_df = pd.DataFrame({'Date': future_dates,
                                          'Predicted Price': future_predictions_actual_scale.flatten()})

    # --- Plotting ---
    fig = px.line(df_processed, x='Date', y='Price',
                  title=f'Housing Prices & LSTM Analysis for Zip Code {zip_val}')
    fig.data[0].showlegend = True
    fig.data[0].name = 'Actual Price'
    for window in [3, 6, 12, 24]:
        fig.add_scatter(x=df_processed['Date'], y=df_processed[f'{window}-Month MA'],
                        mode='lines', name=f'{window}-Month MA')
    # Length guards: skip a trace rather than crash if dates/preds ever diverge.
    if len(past_pred_dates) == len(past_predictions_actual_scale.flatten()):
        fig.add_scatter(x=past_pred_dates, y=past_predictions_actual_scale.flatten(),
                        mode='lines', line=dict(dash='dash'), name='LSTM Fit on Training Data')
    if len(holdout_pred_dates) == len(holdout_predictions_actual_scale.flatten()):
        fig.add_scatter(x=holdout_pred_dates, y=holdout_predictions_actual_scale.flatten(),
                        mode='lines', line=dict(color='red'), name='LSTM Predictions on Hold-out Data')
    fig.add_scatter(x=future_predictions_df['Date'], y=future_predictions_df['Predicted Price'],
                    mode='lines', line=dict(color='green'), name='LSTM Future Predictions')
    fig.update_layout(legend_title_text='Legend')
    return fig
# --- Gradio Interface ---
# Wires plot_real_estate into a simple form: a ZIP-code textbox plus two
# sliders that map positionally onto the function's hold-out and projection
# arguments.  The returned plotly figure is rendered by gr.Plot().
iface = gr.Interface(
    fn=plot_real_estate,
    inputs=[
        gr.Textbox(label="Enter ZIP Code (e.g., 90210)"),
        gr.Slider(label="Months for Hold-out Prediction", minimum=6, maximum=36, value=12, step=1),
        gr.Slider(label="Months to Predict into the Future", minimum=3, maximum=24, value=6, step=1)
    ],
    outputs=gr.Plot(),
    title="Real Estate Price Analysis with LSTM Prediction",
    description=lstm_explanation,  # markdown constant defined at module top
    # NOTE(review): `allow_flagging` is deprecated in newer Gradio releases
    # (replaced by `flagging_mode`) — confirm the pinned Gradio version still
    # accepts it before upgrading.
    allow_flagging='never'
)
if __name__ == '__main__':
    # share=False keeps the app local; debug=True surfaces tracebacks in the console.
    iface.launch(share=False, debug=True)