Spaces:

dibend
/

US-Real-Estate-LSTM

Running

App Files Files Community

dibend committed on May 27, 2025

Commit

dd5d3b4

verified ·

1 Parent(s): ab1aec6

Update app.py

Browse files

Files changed (1) hide show

app.py +50 -48

app.py CHANGED Viewed

@@ -10,16 +10,19 @@ from tensorflow.keras.callbacks import EarlyStopping
 lstm_explanation = """
 ## Understanding LSTM in This App
-**What is LSTM?** LSTM (Long Short-Term Memory) is a type of neural network designed for time-series data, like housing prices. It excels at capturing patterns in sequential data, making it ideal for predicting future values based on historical trends.
-**How is it used here?** - The LSTM model uses housing price data since January 2000 for the selected ZIP code.
 - It takes a 60-month lookback window (5 years) of historical prices to predict the next month's price.
 - The model learns trends, such as seasonal changes or long-term growth.
 - 'LSTM Fit on Training Data' shows how well the model learned the patterns in the historical data it was trained on.
 - 'LSTM Predictions on Hold-out Data' shows the model's predictions for a recent period of actual prices that it wasn't trained on, to evaluate its forecasting ability.
 """
-def plot_real_estate(zip_code_str, future_months_to_predict_on_holdout=12):
     try:
         zip_val = int(zip_code_str)
     except ValueError:
@@ -30,7 +33,6 @@ def plot_real_estate(zip_code_str, future_months_to_predict_on_holdout=12):
     # Extract the data for the given zip code
     df_zip_subset = df_full[df_full['RegionName'] == zip_val]
     if df_zip_subset.empty:
         return px.line(title=f'No data found for Zip Code {zip_val}')
@@ -39,7 +41,7 @@ def plot_real_estate(zip_code_str, future_months_to_predict_on_holdout=12):
     df_processed = df_processed.T.reset_index()
     df_processed.columns = ['Date', 'Price']
     df_processed['Date'] = pd.to_datetime(df_processed['Date'])
-    df_processed.dropna(subset=['Price'], inplace=True) # Remove rows with NaN prices if any
     if len(df_processed['Price']) < 60 + future_months_to_predict_on_holdout:
         return px.line(title=f'Not enough historical data for Zip Code {zip_val} (need at least {60 + future_months_to_predict_on_holdout} months of data).')
@@ -52,29 +54,24 @@ def plot_real_estate(zip_code_str, future_months_to_predict_on_holdout=12):
     prices = df_processed['Price'].values.reshape(-1, 1)
     # Define split point for scaler fitting (all data except the hold-out "future" part)
-    # Ensure there's enough data to form at least one 60-month sequence for training
     train_scaler_fit_size = len(prices) - future_months_to_predict_on_holdout
     if train_scaler_fit_size < 60:
-         return px.line(title=f'Not enough data before hold-out period for Zip Code {zip_val} (need at least 60 months for LSTM lookback).')
     train_prices_for_scaler = prices[:train_scaler_fit_size]
     scaler = MinMaxScaler(feature_range=(0, 1))
-    scaler.fit(train_prices_for_scaler) # Fit scaler ONLY on the training portion
-    scaled_data_full = scaler.transform(prices) # Transform the entire dataset
     # Create the training sequences (from the part the scaler was fit on)
     train_sequences_source_data = scaled_data_full[:train_scaler_fit_size]
     x_train, y_train = [], []
     for i in range(60, len(train_sequences_source_data)):
         x_train.append(train_sequences_source_data[i-60:i, 0])
         y_train.append(train_sequences_source_data[i, 0])
-    if not x_train: # Should be caught by earlier checks, but as a safeguard
         return px.line(title=f'Not enough data to form training sequences for Zip Code {zip_val}.')
     x_train, y_train = np.array(x_train), np.array(y_train)
     x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
@@ -84,16 +81,13 @@ def plot_real_estate(zip_code_str, future_months_to_predict_on_holdout=12):
     model.add(LSTM(units=50, return_sequences=False)) # Can experiment with more layers or units
     model.add(Dense(units=25))
     model.add(Dense(units=1))
     model.compile(optimizer='adam', loss='mean_squared_error')
     # Train the model - **RECOMMENDATION: Increase epochs and adjust patience**
-    # Example: epochs=50, patience=10. Using your original for direct comparison now.
     model.fit(x_train, y_train, batch_size=1, epochs=1, # For better results, try epochs=50 or more
               callbacks=[EarlyStopping(monitor='loss', patience=2)], verbose=0) # verbose=0 to suppress log spam in Gradio
     # --- Predictions ---
     # 1. Past predictions (on the training data part for visualization of fit)
     past_predictions_scaled = model.predict(x_train)
     past_predictions_actual_scale = scaler.inverse_transform(past_predictions_scaled)
@@ -101,30 +95,39 @@ def plot_real_estate(zip_code_str, future_months_to_predict_on_holdout=12):
     past_pred_dates = df_processed['Date'].iloc[60 : len(train_sequences_source_data)].reset_index(drop=True)
     # 2. Predictions on the hold-out set ("future_months_to_predict_on_holdout")
-    x_test = []
-    # Create input sequences for the hold-out period
     for i in range(future_months_to_predict_on_holdout):
         seq_start_idx = train_scaler_fit_size - 60 + i # Start of sequence relative to `prices`
-        seq_end_idx = train_scaler_fit_size + i       # End of sequence relative to `prices`
-        x_test.append(scaled_data_full[seq_start_idx:seq_end_idx, 0])
-    x_test = np.array(x_test)
-    x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))
-    holdout_predictions_scaled = model.predict(x_test)
     holdout_predictions_actual_scale = scaler.inverse_transform(holdout_predictions_scaled)
     # Dates for these hold-out predictions are the last `future_months_to_predict_on_holdout` dates
     holdout_pred_dates = df_processed['Date'].iloc[-future_months_to_predict_on_holdout:].reset_index(drop=True)
     # --- Plotting ---
     fig = px.line(df_processed, x='Date', y='Price', title=f'Housing Prices & LSTM Analysis for Zip Code {zip_val}')
     fig.data[0].showlegend = True
     fig.data[0].name = 'Actual Price'
     for window in [3, 6, 12, 24]:
         fig.add_scatter(x=df_processed['Date'], y=df_processed[f'{window}-Month MA'], mode='lines', name=f'{window}-Month MA')
     # Plot past (training set) predictions
     if len(past_pred_dates) == len(past_predictions_actual_scale.flatten()):
         fig.add_scatter(x=past_pred_dates, y=past_predictions_actual_scale.flatten(), mode='lines', line=dict(dash='dash'), name='LSTM Fit on Training Data')
@@ -132,27 +135,26 @@ def plot_real_estate(zip_code_str, future_months_to_predict_on_holdout=12):
     # Plot predictions on the hold-out set
     if len(holdout_pred_dates) == len(holdout_predictions_actual_scale.flatten()):
         fig.add_scatter(x=holdout_pred_dates, y=holdout_predictions_actual_scale.flatten(), mode='lines', line=dict(color='red'), name='LSTM Predictions on Hold-out Data')
-    # If you want to project dates *beyond* your current dataset for *iterative* future predictions (not done here):
-    # last_actual_date = df_processed['Date'].iloc[-1]
-    # projected_future_dates = pd.date_range(start=last_actual_date, periods=future_months_to_predict_on_holdout + 1, freq='ME')[1:]
-    # And then you would need an iterative prediction loop to generate data for `projected_future_dates`.
-    # Your original 'future_dates' and 'future_df' were plotting the hold-out predictions against such projected dates.
-    # The current plotting aligns hold-out predictions with their actual corresponding dates.
     fig.update_layout(legend_title_text='Legend')
     return fig
 # --- Gradio Interface ---
-iface = gr.Interface(fn=plot_real_estate,
-                     inputs=[
-                         gr.Textbox(label="Enter ZIP Code (e.g., 90210)"),
-                         gr.Slider(label="Months for Hold-out Prediction", minimum=6, maximum=36, value=12, step=1)
-                     ],
-                     outputs=gr.Plot(),
-                     title="Real Estate Price Analysis with LSTM Prediction",
-                     description=lstm_explanation,
-                     allow_flagging='never')
 if __name__ == '__main__':
-    iface.launch(share=False, debug=True) # share=True to create public link (if needed)

 lstm_explanation = """
 ## Understanding LSTM in This App
+**What is LSTM?**
+LSTM (Long Short-Term Memory) is a type of neural network designed for time-series data, like housing prices. It excels at capturing patterns in sequential data, making it ideal for predicting future values based on historical trends.
+**How is it used here?**
+- The LSTM model uses housing price data since January 2000 for the selected ZIP code.
 - It takes a 60-month lookback window (5 years) of historical prices to predict the next month's price.
 - The model learns trends, such as seasonal changes or long-term growth.
 - 'LSTM Fit on Training Data' shows how well the model learned the patterns in the historical data it was trained on.
 - 'LSTM Predictions on Hold-out Data' shows the model's predictions for a recent period of actual prices that it wasn't trained on, to evaluate its forecasting ability.
+- 'LSTM Future Predictions' shows the model's predictions for months *beyond* the available historical data.
 """
+def plot_real_estate(zip_code_str, future_months_to_predict_on_holdout=12, future_months_to_project=6):
     try:
         zip_val = int(zip_code_str)
     except ValueError:
     # Extract the data for the given zip code
     df_zip_subset = df_full[df_full['RegionName'] == zip_val]
     if df_zip_subset.empty:
         return px.line(title=f'No data found for Zip Code {zip_val}')
     df_processed = df_processed.T.reset_index()
     df_processed.columns = ['Date', 'Price']
     df_processed['Date'] = pd.to_datetime(df_processed['Date'])
+    df_processed.dropna(subset=['Price'], inplace=True)  # Remove rows with NaN prices if any
     if len(df_processed['Price']) < 60 + future_months_to_predict_on_holdout:
         return px.line(title=f'Not enough historical data for Zip Code {zip_val} (need at least {60 + future_months_to_predict_on_holdout} months of data).')
     prices = df_processed['Price'].values.reshape(-1, 1)
     # Define split point for scaler fitting (all data except the hold-out "future" part)
     train_scaler_fit_size = len(prices) - future_months_to_predict_on_holdout
     if train_scaler_fit_size < 60:
+        return px.line(title=f'Not enough data before hold-out period for Zip Code {zip_val} (need at least 60 months for LSTM lookback).')
     train_prices_for_scaler = prices[:train_scaler_fit_size]
     scaler = MinMaxScaler(feature_range=(0, 1))
+    scaler.fit(train_prices_for_scaler)  # Fit scaler ONLY on the training portion
+    scaled_data_full = scaler.transform(prices)  # Transform the entire dataset
     # Create the training sequences (from the part the scaler was fit on)
     train_sequences_source_data = scaled_data_full[:train_scaler_fit_size]
     x_train, y_train = [], []
     for i in range(60, len(train_sequences_source_data)):
         x_train.append(train_sequences_source_data[i-60:i, 0])
         y_train.append(train_sequences_source_data[i, 0])
+    if not x_train:  # Should be caught by earlier checks, but as a safeguard
         return px.line(title=f'Not enough data to form training sequences for Zip Code {zip_val}.')
     x_train, y_train = np.array(x_train), np.array(y_train)
     x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
     model.add(LSTM(units=50, return_sequences=False)) # Can experiment with more layers or units
     model.add(Dense(units=25))
     model.add(Dense(units=1))
     model.compile(optimizer='adam', loss='mean_squared_error')
     # Train the model - **RECOMMENDATION: Increase epochs and adjust patience**
     model.fit(x_train, y_train, batch_size=1, epochs=1, # For better results, try epochs=50 or more
               callbacks=[EarlyStopping(monitor='loss', patience=2)], verbose=0) # verbose=0 to suppress log spam in Gradio
     # --- Predictions ---
     # 1. Past predictions (on the training data part for visualization of fit)
     past_predictions_scaled = model.predict(x_train)
     past_predictions_actual_scale = scaler.inverse_transform(past_predictions_scaled)
     past_pred_dates = df_processed['Date'].iloc[60 : len(train_sequences_source_data)].reset_index(drop=True)
     # 2. Predictions on the hold-out set ("future_months_to_predict_on_holdout")
+    x_test_holdout = []
     for i in range(future_months_to_predict_on_holdout):
         seq_start_idx = train_scaler_fit_size - 60 + i # Start of sequence relative to `prices`
+        seq_end_idx = train_scaler_fit_size + i   # End of sequence relative to `prices`
+        x_test_holdout.append(scaled_data_full[seq_start_idx:seq_end_idx, 0])
+    x_test_holdout = np.array(x_test_holdout)
+    x_test_holdout = np.reshape(x_test_holdout, (x_test_holdout.shape[0], x_test_holdout.shape[1], 1))
+    holdout_predictions_scaled = model.predict(x_test_holdout)
     holdout_predictions_actual_scale = scaler.inverse_transform(holdout_predictions_scaled)
     # Dates for these hold-out predictions are the last `future_months_to_predict_on_holdout` dates
     holdout_pred_dates = df_processed['Date'].iloc[-future_months_to_predict_on_holdout:].reset_index(drop=True)
+    # 3. Future predictions (beyond the available data)
+    future_predictions_scaled = []
+    last_sequence = scaled_data_full[-60:].reshape(1, 60, 1)
+    for _ in range(future_months_to_project):
+        predicted_scaled = model.predict(last_sequence)
+        future_predictions_scaled.append(predicted_scaled[0, 0])
+        last_sequence = np.concatenate([last_sequence[:, 1:, :], predicted_scaled.reshape(1, 1, 1)], axis=1)
+    future_predictions_actual_scale = scaler.inverse_transform(np.array(future_predictions_scaled).reshape(-1, 1))
+    last_actual_date = df_processed['Date'].iloc[-1]
+    future_dates = pd.date_range(start=last_actual_date, periods=future_months_to_project + 1, freq='M')[1:]
+    future_predictions_df = pd.DataFrame({'Date': future_dates, 'Predicted Price': future_predictions_actual_scale.flatten()})
     # --- Plotting ---
     fig = px.line(df_processed, x='Date', y='Price', title=f'Housing Prices & LSTM Analysis for Zip Code {zip_val}')
     fig.data[0].showlegend = True
     fig.data[0].name = 'Actual Price'
     for window in [3, 6, 12, 24]:
         fig.add_scatter(x=df_processed['Date'], y=df_processed[f'{window}-Month MA'], mode='lines', name=f'{window}-Month MA')
     # Plot past (training set) predictions
     if len(past_pred_dates) == len(past_predictions_actual_scale.flatten()):
         fig.add_scatter(x=past_pred_dates, y=past_predictions_actual_scale.flatten(), mode='lines', line=dict(dash='dash'), name='LSTM Fit on Training Data')
     # Plot predictions on the hold-out set
     if len(holdout_pred_dates) == len(holdout_predictions_actual_scale.flatten()):
         fig.add_scatter(x=holdout_pred_dates, y=holdout_predictions_actual_scale.flatten(), mode='lines', line=dict(color='red'), name='LSTM Predictions on Hold-out Data')
+    # Plot future predictions
+    fig.add_scatter(x=future_predictions_df['Date'], y=future_predictions_df['Predicted Price'], mode='lines', line=dict(color='green'), name='LSTM Future Predictions')
     fig.update_layout(legend_title_text='Legend')
     return fig
 # --- Gradio Interface ---
+iface = gr.Interface(
+    fn=plot_real_estate,
+    inputs=[
+        gr.Textbox(label="Enter ZIP Code (e.g., 90210)"),
+        gr.Slider(label="Months for Hold-out Prediction", minimum=6, maximum=36, value=12, step=1),
+        gr.Slider(label="Months to Predict into the Future", minimum=3, maximum=24, value=6, step=1)
+    ],
+    outputs=gr.Plot(),
+    title="Real Estate Price Analysis with LSTM Prediction",
+    description=lstm_explanation,
+    allow_flagging='never'
+)
 if __name__ == '__main__':
+    iface.launch(share=False, debug=True)