Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -10,104 +10,149 @@ from tensorflow.keras.callbacks import EarlyStopping
|
|
| 10 |
lstm_explanation = """
|
| 11 |
## Understanding LSTM in This App
|
| 12 |
|
| 13 |
-
**What is LSTM?**
|
| 14 |
-
LSTM (Long Short-Term Memory) is a type of neural network designed for time-series data, like housing prices. It excels at capturing patterns in sequential data, making it ideal for predicting future values based on historical trends.
|
| 15 |
|
| 16 |
-
**How is it used here?**
|
| 17 |
-
- The LSTM model uses housing price data since January 2000 for the selected ZIP code.
|
| 18 |
- It takes a 60-month lookback window (5 years) of historical prices to predict the next month's price.
|
| 19 |
-
- The model learns trends, such as seasonal changes or long-term growth
|
|
|
|
|
|
|
| 20 |
"""
|
| 21 |
|
| 22 |
-
def plot_real_estate(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
# Read the CSV file
|
| 24 |
-
|
| 25 |
|
| 26 |
-
# Extract the data for zip code
|
| 27 |
-
|
| 28 |
|
| 29 |
-
|
| 30 |
-
|
| 31 |
|
| 32 |
-
#
|
| 33 |
-
|
| 34 |
-
|
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
-
|
| 37 |
-
|
| 38 |
|
| 39 |
# Compute the moving averages
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
# Prepare data for LSTM
|
| 56 |
scaler = MinMaxScaler(feature_range=(0, 1))
|
| 57 |
-
|
|
|
|
|
|
|
| 58 |
|
| 59 |
-
# Create the training
|
| 60 |
-
|
|
|
|
| 61 |
x_train, y_train = [], []
|
| 62 |
-
for i in range(60, len(
|
| 63 |
-
x_train.append(
|
| 64 |
-
y_train.append(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
x_train, y_train = np.array(x_train), np.array(y_train)
|
| 66 |
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
|
| 67 |
|
| 68 |
# Build the LSTM model
|
| 69 |
model = Sequential()
|
| 70 |
model.add(LSTM(units=50, return_sequences=True, input_shape=(x_train.shape[1], 1)))
|
| 71 |
-
model.add(LSTM(units=50, return_sequences=False))
|
| 72 |
model.add(Dense(units=25))
|
| 73 |
model.add(Dense(units=1))
|
| 74 |
|
| 75 |
-
# Compile the model
|
| 76 |
model.compile(optimizer='adam', loss='mean_squared_error')
|
| 77 |
|
| 78 |
-
# Train the model
|
| 79 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
|
| 81 |
-
#
|
| 82 |
-
test_data = scaled_data[-(60+future_months):]
|
| 83 |
x_test = []
|
| 84 |
-
|
| 85 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
x_test = np.array(x_test)
|
| 87 |
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))
|
| 88 |
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
#
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
# Plot
|
| 104 |
-
|
| 105 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
return fig
|
| 107 |
|
|
|
|
| 108 |
iface = gr.Interface(fn=plot_real_estate,
|
| 109 |
-
inputs=[
|
|
|
|
|
|
|
|
|
|
| 110 |
outputs=gr.Plot(),
|
| 111 |
-
|
|
|
|
|
|
|
| 112 |
|
| 113 |
-
|
|
|
|
|
|
| 10 |
lstm_explanation = """
|
| 11 |
## Understanding LSTM in This App
|
| 12 |
|
| 13 |
+
**What is LSTM?** LSTM (Long Short-Term Memory) is a type of neural network designed for time-series data, like housing prices. It excels at capturing patterns in sequential data, making it ideal for predicting future values based on historical trends.
|
|
|
|
| 14 |
|
| 15 |
+
**How is it used here?** - The LSTM model uses housing price data since January 2000 for the selected ZIP code.
|
|
|
|
| 16 |
- It takes a 60-month lookback window (5 years) of historical prices to predict the next month's price.
|
| 17 |
+
- The model learns trends, such as seasonal changes or long-term growth.
|
| 18 |
+
- 'LSTM Fit on Training Data' shows how well the model learned the patterns in the historical data it was trained on.
|
| 19 |
+
- 'LSTM Predictions on Hold-out Data' shows the model's predictions for a recent period of actual prices that it wasn't trained on, to evaluate its forecasting ability.
|
| 20 |
"""
|
| 21 |
|
| 22 |
+
def plot_real_estate(zip_code_str, future_months_to_predict_on_holdout=12):
    """Train an LSTM on Zillow ZHVI prices for one ZIP code and plot the result.

    Downloads the Zillow ZHVI CSV, filters to the given ZIP, fits an LSTM on a
    60-month lookback window (holding out the last `future_months_to_predict_on_holdout`
    months), and returns a Plotly figure showing actual prices, moving averages,
    the in-sample fit, and the hold-out predictions.

    Args:
        zip_code_str: ZIP code as text (from the Gradio Textbox); must parse as int.
        future_months_to_predict_on_holdout: number of trailing months held out
            of training and predicted for evaluation (default 12).

    Returns:
        A plotly.express figure; on any validation failure an empty px.line
        figure whose title carries the error message is returned instead.
    """
    try:
        zip_val = int(zip_code_str)
    except ValueError:
        # Non-numeric input: surface the error as the plot title rather than raising.
        return px.line(title=f"Invalid ZIP Code: '{zip_code_str}'. Please enter a numeric ZIP code.")

    # Read the CSV file (fetched over the network on every call; no local caching)
    df_full = pd.read_csv('https://files.zillowstatic.com/research/public_csvs/zhvi/Zip_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv')

    # Extract the data for the given zip code (CSV's 'RegionName' column holds ZIPs as ints)
    df_zip_subset = df_full[df_full['RegionName'] == zip_val]

    if df_zip_subset.empty:
        return px.line(title=f'No data found for Zip Code {zip_val}')

    # Select the columns with dates and process: slice from the first 2000 month
    # onward, then transpose so each row is (Date, Price).
    # NOTE(review): assumes the CSV's date columns are labeled '2000-01-31', ... — confirm against the feed.
    df_processed = df_zip_subset.loc[:, '2000-01-31':]
    df_processed = df_processed.T.reset_index()
    df_processed.columns = ['Date', 'Price']
    df_processed['Date'] = pd.to_datetime(df_processed['Date'])
    df_processed.dropna(subset=['Price'], inplace=True)  # Remove rows with NaN prices if any

    # Need at least one full 60-month lookback window plus the hold-out period.
    if len(df_processed['Price']) < 60 + future_months_to_predict_on_holdout:
        return px.line(title=f'Not enough historical data for Zip Code {zip_val} (need at least {60 + future_months_to_predict_on_holdout} months of data).')

    # Compute the moving averages (plotted later alongside the raw price)
    for window in [3, 6, 12, 24]:
        df_processed[f'{window}-Month MA'] = df_processed['Price'].rolling(window).mean()

    # --- Prepare data for LSTM ---
    prices = df_processed['Price'].values.reshape(-1, 1)  # column vector for the scaler

    # Define split point for scaler fitting (all data except the hold-out "future" part)
    # Ensure there's enough data to form at least one 60-month sequence for training
    train_scaler_fit_size = len(prices) - future_months_to_predict_on_holdout
    if train_scaler_fit_size < 60:
        return px.line(title=f'Not enough data before hold-out period for Zip Code {zip_val} (need at least 60 months for LSTM lookback).')

    train_prices_for_scaler = prices[:train_scaler_fit_size]

    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler.fit(train_prices_for_scaler)  # Fit scaler ONLY on the training portion (avoids hold-out leakage)

    scaled_data_full = scaler.transform(prices)  # Transform the entire dataset

    # Create the training sequences (from the part the scaler was fit on)
    train_sequences_source_data = scaled_data_full[:train_scaler_fit_size]

    x_train, y_train = [], []
    # Each sample: 60 consecutive scaled prices -> the next month's scaled price.
    for i in range(60, len(train_sequences_source_data)):
        x_train.append(train_sequences_source_data[i-60:i, 0])
        y_train.append(train_sequences_source_data[i, 0])

    if not x_train:  # Should be caught by earlier checks, but as a safeguard
        return px.line(title=f'Not enough data to form training sequences for Zip Code {zip_val}.')

    x_train, y_train = np.array(x_train), np.array(y_train)
    # Keras LSTM expects (samples, timesteps, features); features=1 here.
    x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))

    # Build the LSTM model: two stacked LSTM layers, then two Dense layers down to
    # a single-price output.
    model = Sequential()
    model.add(LSTM(units=50, return_sequences=True, input_shape=(x_train.shape[1], 1)))
    model.add(LSTM(units=50, return_sequences=False))  # Can experiment with more layers or units
    model.add(Dense(units=25))
    model.add(Dense(units=1))

    model.compile(optimizer='adam', loss='mean_squared_error')

    # Train the model - **RECOMMENDATION: Increase epochs and adjust patience**
    # Example: epochs=50, patience=10. Using your original for direct comparison now.
    # NOTE(review): with epochs=1 the EarlyStopping callback can never trigger.
    model.fit(x_train, y_train, batch_size=1, epochs=1,  # For better results, try epochs=50 or more
              callbacks=[EarlyStopping(monitor='loss', patience=2)], verbose=0)  # verbose=0 to suppress log spam in Gradio

    # --- Predictions ---

    # 1. Past predictions (on the training data part for visualization of fit)
    past_predictions_scaled = model.predict(x_train)
    past_predictions_actual_scale = scaler.inverse_transform(past_predictions_scaled)
    # Dates for these past predictions align with y_train targets
    past_pred_dates = df_processed['Date'].iloc[60 : len(train_sequences_source_data)].reset_index(drop=True)

    # 2. Predictions on the hold-out set ("future_months_to_predict_on_holdout")
    x_test = []
    # Create input sequences for the hold-out period. Each hold-out month is
    # predicted from the 60 *actual* (scaled) prices preceding it, i.e. walk-forward
    # with observed values, not iterative self-feeding.
    for i in range(future_months_to_predict_on_holdout):
        seq_start_idx = train_scaler_fit_size - 60 + i  # Start of sequence relative to `prices`
        seq_end_idx = train_scaler_fit_size + i  # End of sequence relative to `prices`
        x_test.append(scaled_data_full[seq_start_idx:seq_end_idx, 0])

    x_test = np.array(x_test)
    x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))

    holdout_predictions_scaled = model.predict(x_test)
    holdout_predictions_actual_scale = scaler.inverse_transform(holdout_predictions_scaled)

    # Dates for these hold-out predictions are the last `future_months_to_predict_on_holdout` dates
    holdout_pred_dates = df_processed['Date'].iloc[-future_months_to_predict_on_holdout:].reset_index(drop=True)

    # --- Plotting ---
    fig = px.line(df_processed, x='Date', y='Price', title=f'Housing Prices & LSTM Analysis for Zip Code {zip_val}')
    fig.data[0].showlegend = True
    fig.data[0].name = 'Actual Price'

    for window in [3, 6, 12, 24]:
        fig.add_scatter(x=df_processed['Date'], y=df_processed[f'{window}-Month MA'], mode='lines', name=f'{window}-Month MA')

    # Plot past (training set) predictions; length guard skips the trace on any mismatch.
    if len(past_pred_dates) == len(past_predictions_actual_scale.flatten()):
        fig.add_scatter(x=past_pred_dates, y=past_predictions_actual_scale.flatten(), mode='lines', line=dict(dash='dash'), name='LSTM Fit on Training Data')

    # Plot predictions on the hold-out set
    if len(holdout_pred_dates) == len(holdout_predictions_actual_scale.flatten()):
        fig.add_scatter(x=holdout_pred_dates, y=holdout_predictions_actual_scale.flatten(), mode='lines', line=dict(color='red'), name='LSTM Predictions on Hold-out Data')

    # If you want to project dates *beyond* your current dataset for *iterative* future predictions (not done here):
    # last_actual_date = df_processed['Date'].iloc[-1]
    # projected_future_dates = pd.date_range(start=last_actual_date, periods=future_months_to_predict_on_holdout + 1, freq='ME')[1:]
    # And then you would need an iterative prediction loop to generate data for `projected_future_dates`.
    # Your original 'future_dates' and 'future_df' were plotting the hold-out predictions against such projected dates.
    # The current plotting aligns hold-out predictions with their actual corresponding dates.

    fig.update_layout(legend_title_text='Legend')
    return fig
|
| 145 |
|
| 146 |
+
# --- Gradio Interface ---
# Wires plot_real_estate to a Textbox (ZIP code string) and a Slider (hold-out
# months, 6-36, default 12) matching the function's two parameters; output is
# the returned Plotly figure. `lstm_explanation` renders as the description.
iface = gr.Interface(fn=plot_real_estate,
                     inputs=[
                         gr.Textbox(label="Enter ZIP Code (e.g., 90210)"),
                         gr.Slider(label="Months for Hold-out Prediction", minimum=6, maximum=36, value=12, step=1)
                     ],
                     outputs=gr.Plot(),
                     title="Real Estate Price Analysis with LSTM Prediction",
                     description=lstm_explanation,
                     allow_flagging='never')
|
| 156 |
|
| 157 |
+
# Script entry point: launch the Gradio app only when run directly, not on import.
if __name__ == '__main__':
    # debug=True prints tracebacks to the console; share stays off by default.
    iface.launch(debug=True, share=False)  # share=True to create public link (if needed)
|