Spaces:

dibend
/

US-Real-Estate-LSTM

Running

File size: 8,716 Bytes

0e46fa9
 
 
 
 
 
 
 
 
d106ab7
 
 
dd5d3b4
 
d106ab7
dd5d3b4
 
d106ab7
5797f05
 
 
dd5d3b4
d106ab7
 
dd5d3b4
5797f05
 
 
 
 
0e46fa9
5797f05
0e46fa9
5797f05
 
 
 
0e46fa9
5797f05
 
 
 
 
dd5d3b4
0e46fa9
5797f05
 
0e46fa9
 
5797f05
 
 
 
 
 
 
 
 
dd5d3b4
5797f05
0e46fa9
dd5d3b4
 
0e46fa9
5797f05
 
0e46fa9
5797f05
 
 
dd5d3b4
 
5797f05
dd5d3b4
0e46fa9
 
 
 
 
 
5797f05
0e46fa9
 
 
 
5797f05
 
 
 
 
 
 
 
 
 
0e46fa9
5797f05
dd5d3b4
5797f05
 
dd5d3b4
 
 
 
 
5797f05
 
 
 
dd5d3b4
 
 
 
 
 
 
 
 
 
 
 
 
 
5797f05
 
 
 
 
 
dd5d3b4
5797f05
 
 
 
 
 
 
dd5d3b4
 
 
5797f05
 
0e46fa9
 
5797f05
dd5d3b4
 
 
 
 
 
 
 
 
 
 
 
0e46fa9
5797f05
dd5d3b4

import gradio as gr
import pandas as pd
import plotly.express as px
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping

# Markdown text passed as the `description` of the Gradio interface defined
# below. It describes the 60-month lookback and the three LSTM traces
# (training fit, hold-out predictions, future predictions) that
# plot_real_estate adds to the figure; keep the trace names quoted here in
# sync with the `name=` values used when plotting.
lstm_explanation = """
## Understanding LSTM in This App

**What is LSTM?**
LSTM (Long Short-Term Memory) is a type of neural network designed for time-series data, like housing prices. It excels at capturing patterns in sequential data, making it ideal for predicting future values based on historical trends.

**How is it used here?**
- The LSTM model uses housing price data since January 2000 for the selected ZIP code.
- It takes a 60-month lookback window (5 years) of historical prices to predict the next month's price.
- The model learns trends, such as seasonal changes or long-term growth.
- 'LSTM Fit on Training Data' shows how well the model learned the patterns in the historical data it was trained on.
- 'LSTM Predictions on Hold-out Data' shows the model's predictions for a recent period of actual prices that it wasn't trained on, to evaluate its forecasting ability.
- 'LSTM Future Predictions' shows the model's predictions for months *beyond* the available historical data.
"""

# Number of trailing monthly prices fed to the LSTM for each one-step prediction.
_LOOKBACK_MONTHS = 60
# Rolling-mean window sizes (in months) drawn alongside the actual price.
_MA_WINDOWS = (3, 6, 12, 24)
# Zillow Home Value Index, one row per ZIP code and one column per month-end date.
_ZHVI_CSV_URL = 'https://files.zillowstatic.com/research/public_csvs/zhvi/Zip_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv'


def _message_figure(message):
    """Return an empty Plotly line figure whose title carries `message`."""
    return px.line(title=message)


def _lookback_windows(scaled_column, first_target, last_target, lookback=_LOOKBACK_MONTHS):
    """Stack the `lookback` values that precede each target index.

    Args:
        scaled_column: 1-D array of scaled prices.
        first_target: First target index (inclusive); must be >= `lookback`.
        last_target: Last target index (exclusive).
        lookback: Length of each input window.

    Returns:
        Array of shape (last_target - first_target, lookback, 1); the first
        dimension is 0 when the target range is empty.
    """
    windows = [scaled_column[t - lookback:t] for t in range(first_target, last_target)]
    return np.asarray(windows, dtype=float).reshape(len(windows), lookback, 1)


def plot_real_estate(zip_code_str, future_months_to_predict_on_holdout=12, future_months_to_project=6):
    """Plot Zillow home values for a ZIP code together with LSTM predictions.

    The monthly price series (from the 2000-01-31 column onwards) is
    downloaded, moving averages are added, and a small LSTM is trained on
    everything except the last `future_months_to_predict_on_holdout` months.
    The figure shows the actual prices, the moving averages, the model's fit
    on the training span, its predictions on the hold-out span and a
    recursive projection beyond the last available month.

    Args:
        zip_code_str: ZIP code as text (or anything `int()` accepts).
        future_months_to_predict_on_holdout: Number of most recent months
            withheld from training and predicted for comparison. Coerced to
            int; 0 disables the hold-out trace.
        future_months_to_project: Number of months to forecast past the last
            observation. Coerced to int; 0 disables the projection trace.

    Returns:
        A Plotly figure. Every validation or data problem is reported as an
        empty figure whose title holds the message, never as an exception.
    """
    try:
        zip_val = int(zip_code_str)
    except (TypeError, ValueError):  # TypeError covers None from an unset input
        return _message_figure(f"Invalid ZIP Code: '{zip_code_str}'. Please enter a numeric ZIP code.")

    # UI sliders may hand over floats; range() and .iloc below require ints.
    try:
        holdout_months = int(future_months_to_predict_on_holdout)
        project_months = int(future_months_to_project)
    except (TypeError, ValueError, OverflowError):
        return _message_figure('Hold-out and projection month counts must be whole numbers.')
    if holdout_months < 0 or project_months < 0:
        return _message_figure('Hold-out and projection month counts must not be negative.')

    # Read the CSV file; report download/parse failures the same way as the
    # other input problems instead of letting the exception escape.
    try:
        df_full = pd.read_csv(_ZHVI_CSV_URL)
    except (OSError, pd.errors.ParserError, pd.errors.EmptyDataError) as err:
        return _message_figure(f'Could not load Zillow data: {err}')

    # Extract the data for the given zip code
    df_zip_subset = df_full[df_full['RegionName'] == zip_val]
    if df_zip_subset.empty:
        return _message_figure(f'No data found for Zip Code {zip_val}')

    # Keep only the date columns and turn the single row into a Date/Price table
    df_processed = df_zip_subset.loc[:, '2000-01-31':].T.reset_index()
    df_processed.columns = ['Date', 'Price']
    df_processed['Date'] = pd.to_datetime(df_processed['Date'])
    df_processed = df_processed.dropna(subset=['Price']).reset_index(drop=True)

    required_months = _LOOKBACK_MONTHS + holdout_months
    if len(df_processed) < required_months:
        return _message_figure(f'Not enough historical data for Zip Code {zip_val} (need at least {required_months} months of data).')

    # Everything before the hold-out span is used for scaling and training.
    train_size = len(df_processed) - holdout_months
    if train_size <= _LOOKBACK_MONTHS:
        # Exactly `lookback` training months leave no target to learn from.
        return _message_figure(f'Not enough data to form training sequences for Zip Code {zip_val}.')

    # Compute the moving averages
    for window in _MA_WINDOWS:
        df_processed[f'{window}-Month MA'] = df_processed['Price'].rolling(window).mean()

    # --- Prepare data for LSTM ---
    prices = df_processed['Price'].values.reshape(-1, 1)
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler.fit(prices[:train_size])  # Fit ONLY on the training span to avoid leaking hold-out values
    scaled_full = scaler.transform(prices)
    scaled_column = scaled_full[:, 0]

    x_train = _lookback_windows(scaled_column, _LOOKBACK_MONTHS, train_size)
    y_train = scaled_column[_LOOKBACK_MONTHS:train_size]

    # Build the LSTM model
    model = Sequential()
    model.add(LSTM(units=50, return_sequences=True, input_shape=(_LOOKBACK_MONTHS, 1)))
    model.add(LSTM(units=50, return_sequences=False))
    model.add(Dense(units=25))
    model.add(Dense(units=1))
    model.compile(optimizer='adam', loss='mean_squared_error')

    # A single epoch keeps the request fast; raise `epochs` (e.g. 50) for a
    # better fit, at which point the early-stopping callback becomes useful.
    model.fit(x_train, y_train, batch_size=1, epochs=1,
              callbacks=[EarlyStopping(monitor='loss', patience=2)], verbose=0)

    # --- Predictions ---
    # 1. Fit on the training span; dates line up with the y_train targets.
    train_fit = scaler.inverse_transform(model.predict(x_train, verbose=0)).flatten()
    train_fit_dates = df_processed['Date'].iloc[_LOOKBACK_MONTHS:train_size].reset_index(drop=True)

    # 2. One-step predictions for each hold-out month, each fed with the
    #    actual prices of the preceding lookback window.
    holdout_pred = None
    if holdout_months > 0:
        x_holdout = _lookback_windows(scaled_column, train_size, len(scaled_column))
        holdout_pred = scaler.inverse_transform(model.predict(x_holdout, verbose=0)).flatten()
        # iloc[train_size:] rather than iloc[-n:], which would select everything for n == 0
        holdout_dates = df_processed['Date'].iloc[train_size:].reset_index(drop=True)

    # 3. Recursive projection beyond the data: each prediction is appended to
    #    the window that feeds the next step.
    future_pred = None
    if project_months > 0:
        future_scaled = []
        window_input = scaled_full[-_LOOKBACK_MONTHS:].reshape(1, _LOOKBACK_MONTHS, 1)
        for _ in range(project_months):
            step = model.predict(window_input, verbose=0)
            future_scaled.append(step[0, 0])
            window_input = np.concatenate([window_input[:, 1:, :], step.reshape(1, 1, 1)], axis=1)
        future_pred = scaler.inverse_transform(np.array(future_scaled).reshape(-1, 1)).flatten()
        # An explicit MonthEnd offset replaces the 'M' alias, which newer
        # pandas releases deprecate; the leading element (the last actual
        # date's own month-end) is dropped.
        last_actual_date = df_processed['Date'].iloc[-1]
        future_dates = pd.date_range(start=last_actual_date, periods=project_months + 1,
                                     freq=pd.offsets.MonthEnd())[1:]

    # --- Plotting ---
    fig = px.line(df_processed, x='Date', y='Price', title=f'Housing Prices & LSTM Analysis for Zip Code {zip_val}')
    fig.data[0].showlegend = True
    fig.data[0].name = 'Actual Price'
    for window in _MA_WINDOWS:
        fig.add_scatter(x=df_processed['Date'], y=df_processed[f'{window}-Month MA'], mode='lines', name=f'{window}-Month MA')

    fig.add_scatter(x=train_fit_dates, y=train_fit, mode='lines', line=dict(dash='dash'), name='LSTM Fit on Training Data')
    if holdout_pred is not None:
        fig.add_scatter(x=holdout_dates, y=holdout_pred, mode='lines', line=dict(color='red'), name='LSTM Predictions on Hold-out Data')
    if future_pred is not None:
        fig.add_scatter(x=future_dates, y=future_pred, mode='lines', line=dict(color='green'), name='LSTM Future Predictions')

    fig.update_layout(legend_title_text='Legend')
    return fig

# --- Gradio Interface ---
# Wires plot_real_estate to a simple form. The three inputs are listed in the
# same order as the function's parameters (ZIP code, hold-out months, months
# to project) and the returned Plotly figure is rendered by gr.Plot.
iface = gr.Interface(
    fn=plot_real_estate,
    inputs=[
        gr.Textbox(label="Enter ZIP Code (e.g., 90210)"),
        # Slider ranges bound what the UI can request: 6-36 hold-out months
        # (default 12) and 3-24 projected months (default 6), in steps of 1.
        gr.Slider(label="Months for Hold-out Prediction", minimum=6, maximum=36, value=12, step=1),
        gr.Slider(label="Months to Predict into the Future", minimum=3, maximum=24, value=6, step=1)
    ],
    outputs=gr.Plot(),
    title="Real Estate Price Analysis with LSTM Prediction",
    description=lstm_explanation,
    # NOTE(review): newer Gradio releases appear to rename this option to
    # `flagging_mode` -- confirm against the pinned Gradio version.
    allow_flagging='never'
)

# Start the app only when this file is executed as a script, not on import.
if __name__ == '__main__':
    iface.launch(share=False, debug=True)