dibend committed on
Commit
dd5d3b4
·
verified ·
1 Parent(s): ab1aec6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -48
app.py CHANGED
@@ -10,16 +10,19 @@ from tensorflow.keras.callbacks import EarlyStopping
10
  lstm_explanation = """
11
  ## Understanding LSTM in This App
12
 
13
- **What is LSTM?** LSTM (Long Short-Term Memory) is a type of neural network designed for time-series data, like housing prices. It excels at capturing patterns in sequential data, making it ideal for predicting future values based on historical trends.
 
14
 
15
- **How is it used here?** - The LSTM model uses housing price data since January 2000 for the selected ZIP code.
 
16
  - It takes a 60-month lookback window (5 years) of historical prices to predict the next month's price.
17
  - The model learns trends, such as seasonal changes or long-term growth.
18
  - 'LSTM Fit on Training Data' shows how well the model learned the patterns in the historical data it was trained on.
19
  - 'LSTM Predictions on Hold-out Data' shows the model's predictions for a recent period of actual prices that it wasn't trained on, to evaluate its forecasting ability.
 
20
  """
21
 
22
- def plot_real_estate(zip_code_str, future_months_to_predict_on_holdout=12):
23
  try:
24
  zip_val = int(zip_code_str)
25
  except ValueError:
@@ -30,7 +33,6 @@ def plot_real_estate(zip_code_str, future_months_to_predict_on_holdout=12):
30
 
31
  # Extract the data for the given zip code
32
  df_zip_subset = df_full[df_full['RegionName'] == zip_val]
33
-
34
  if df_zip_subset.empty:
35
  return px.line(title=f'No data found for Zip Code {zip_val}')
36
 
@@ -39,7 +41,7 @@ def plot_real_estate(zip_code_str, future_months_to_predict_on_holdout=12):
39
  df_processed = df_processed.T.reset_index()
40
  df_processed.columns = ['Date', 'Price']
41
  df_processed['Date'] = pd.to_datetime(df_processed['Date'])
42
- df_processed.dropna(subset=['Price'], inplace=True) # Remove rows with NaN prices if any
43
 
44
  if len(df_processed['Price']) < 60 + future_months_to_predict_on_holdout:
45
  return px.line(title=f'Not enough historical data for Zip Code {zip_val} (need at least {60 + future_months_to_predict_on_holdout} months of data).')
@@ -52,29 +54,24 @@ def plot_real_estate(zip_code_str, future_months_to_predict_on_holdout=12):
52
  prices = df_processed['Price'].values.reshape(-1, 1)
53
 
54
  # Define split point for scaler fitting (all data except the hold-out "future" part)
55
- # Ensure there's enough data to form at least one 60-month sequence for training
56
  train_scaler_fit_size = len(prices) - future_months_to_predict_on_holdout
57
  if train_scaler_fit_size < 60:
58
- return px.line(title=f'Not enough data before hold-out period for Zip Code {zip_val} (need at least 60 months for LSTM lookback).')
59
-
60
  train_prices_for_scaler = prices[:train_scaler_fit_size]
61
-
62
  scaler = MinMaxScaler(feature_range=(0, 1))
63
- scaler.fit(train_prices_for_scaler) # Fit scaler ONLY on the training portion
64
-
65
- scaled_data_full = scaler.transform(prices) # Transform the entire dataset
66
 
67
  # Create the training sequences (from the part the scaler was fit on)
68
  train_sequences_source_data = scaled_data_full[:train_scaler_fit_size]
69
-
70
  x_train, y_train = [], []
71
  for i in range(60, len(train_sequences_source_data)):
72
  x_train.append(train_sequences_source_data[i-60:i, 0])
73
  y_train.append(train_sequences_source_data[i, 0])
74
-
75
- if not x_train: # Should be caught by earlier checks, but as a safeguard
76
  return px.line(title=f'Not enough data to form training sequences for Zip Code {zip_val}.')
77
-
78
  x_train, y_train = np.array(x_train), np.array(y_train)
79
  x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
80
 
@@ -84,16 +81,13 @@ def plot_real_estate(zip_code_str, future_months_to_predict_on_holdout=12):
84
  model.add(LSTM(units=50, return_sequences=False)) # Can experiment with more layers or units
85
  model.add(Dense(units=25))
86
  model.add(Dense(units=1))
87
-
88
  model.compile(optimizer='adam', loss='mean_squared_error')
89
 
90
  # Train the model - **RECOMMENDATION: Increase epochs and adjust patience**
91
- # Example: epochs=50, patience=10. Using your original for direct comparison now.
92
  model.fit(x_train, y_train, batch_size=1, epochs=1, # For better results, try epochs=50 or more
93
  callbacks=[EarlyStopping(monitor='loss', patience=2)], verbose=0) # verbose=0 to suppress log spam in Gradio
94
 
95
  # --- Predictions ---
96
-
97
  # 1. Past predictions (on the training data part for visualization of fit)
98
  past_predictions_scaled = model.predict(x_train)
99
  past_predictions_actual_scale = scaler.inverse_transform(past_predictions_scaled)
@@ -101,30 +95,39 @@ def plot_real_estate(zip_code_str, future_months_to_predict_on_holdout=12):
101
  past_pred_dates = df_processed['Date'].iloc[60 : len(train_sequences_source_data)].reset_index(drop=True)
102
 
103
  # 2. Predictions on the hold-out set ("future_months_to_predict_on_holdout")
104
- x_test = []
105
- # Create input sequences for the hold-out period
106
  for i in range(future_months_to_predict_on_holdout):
107
  seq_start_idx = train_scaler_fit_size - 60 + i # Start of sequence relative to `prices`
108
- seq_end_idx = train_scaler_fit_size + i # End of sequence relative to `prices`
109
- x_test.append(scaled_data_full[seq_start_idx:seq_end_idx, 0])
110
-
111
- x_test = np.array(x_test)
112
- x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))
113
-
114
- holdout_predictions_scaled = model.predict(x_test)
115
  holdout_predictions_actual_scale = scaler.inverse_transform(holdout_predictions_scaled)
116
-
117
  # Dates for these hold-out predictions are the last `future_months_to_predict_on_holdout` dates
118
  holdout_pred_dates = df_processed['Date'].iloc[-future_months_to_predict_on_holdout:].reset_index(drop=True)
119
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  # --- Plotting ---
121
  fig = px.line(df_processed, x='Date', y='Price', title=f'Housing Prices & LSTM Analysis for Zip Code {zip_val}')
122
  fig.data[0].showlegend = True
123
  fig.data[0].name = 'Actual Price'
124
-
125
  for window in [3, 6, 12, 24]:
126
  fig.add_scatter(x=df_processed['Date'], y=df_processed[f'{window}-Month MA'], mode='lines', name=f'{window}-Month MA')
127
-
128
  # Plot past (training set) predictions
129
  if len(past_pred_dates) == len(past_predictions_actual_scale.flatten()):
130
  fig.add_scatter(x=past_pred_dates, y=past_predictions_actual_scale.flatten(), mode='lines', line=dict(dash='dash'), name='LSTM Fit on Training Data')
@@ -132,27 +135,26 @@ def plot_real_estate(zip_code_str, future_months_to_predict_on_holdout=12):
132
  # Plot predictions on the hold-out set
133
  if len(holdout_pred_dates) == len(holdout_predictions_actual_scale.flatten()):
134
  fig.add_scatter(x=holdout_pred_dates, y=holdout_predictions_actual_scale.flatten(), mode='lines', line=dict(color='red'), name='LSTM Predictions on Hold-out Data')
135
-
136
- # If you want to project dates *beyond* your current dataset for *iterative* future predictions (not done here):
137
- # last_actual_date = df_processed['Date'].iloc[-1]
138
- # projected_future_dates = pd.date_range(start=last_actual_date, periods=future_months_to_predict_on_holdout + 1, freq='ME')[1:]
139
- # And then you would need an iterative prediction loop to generate data for `projected_future_dates`.
140
- # Your original 'future_dates' and 'future_df' were plotting the hold-out predictions against such projected dates.
141
- # The current plotting aligns hold-out predictions with their actual corresponding dates.
142
 
143
  fig.update_layout(legend_title_text='Legend')
144
  return fig
145
 
146
  # --- Gradio Interface ---
147
- iface = gr.Interface(fn=plot_real_estate,
148
- inputs=[
149
- gr.Textbox(label="Enter ZIP Code (e.g., 90210)"),
150
- gr.Slider(label="Months for Hold-out Prediction", minimum=6, maximum=36, value=12, step=1)
151
- ],
152
- outputs=gr.Plot(),
153
- title="Real Estate Price Analysis with LSTM Prediction",
154
- description=lstm_explanation,
155
- allow_flagging='never')
 
 
 
156
 
157
  if __name__ == '__main__':
158
- iface.launch(share=False, debug=True) # share=True to create public link (if needed)
 
10
  lstm_explanation = """
11
  ## Understanding LSTM in This App
12
 
13
+ **What is LSTM?**
14
+ LSTM (Long Short-Term Memory) is a type of neural network designed for time-series data, like housing prices. It excels at capturing patterns in sequential data, making it ideal for predicting future values based on historical trends.
15
 
16
+ **How is it used here?**
17
+ - The LSTM model uses housing price data since January 2000 for the selected ZIP code.
18
  - It takes a 60-month lookback window (5 years) of historical prices to predict the next month's price.
19
  - The model learns trends, such as seasonal changes or long-term growth.
20
  - 'LSTM Fit on Training Data' shows how well the model learned the patterns in the historical data it was trained on.
21
  - 'LSTM Predictions on Hold-out Data' shows the model's predictions for a recent period of actual prices that it wasn't trained on, to evaluate its forecasting ability.
22
+ - 'LSTM Future Predictions' shows the model's predictions for months *beyond* the available historical data.
23
  """
24
 
25
+ def plot_real_estate(zip_code_str, future_months_to_predict_on_holdout=12, future_months_to_project=6):
26
  try:
27
  zip_val = int(zip_code_str)
28
  except ValueError:
 
33
 
34
  # Extract the data for the given zip code
35
  df_zip_subset = df_full[df_full['RegionName'] == zip_val]
 
36
  if df_zip_subset.empty:
37
  return px.line(title=f'No data found for Zip Code {zip_val}')
38
 
 
41
  df_processed = df_processed.T.reset_index()
42
  df_processed.columns = ['Date', 'Price']
43
  df_processed['Date'] = pd.to_datetime(df_processed['Date'])
44
+ df_processed.dropna(subset=['Price'], inplace=True) # Remove rows with NaN prices if any
45
 
46
  if len(df_processed['Price']) < 60 + future_months_to_predict_on_holdout:
47
  return px.line(title=f'Not enough historical data for Zip Code {zip_val} (need at least {60 + future_months_to_predict_on_holdout} months of data).')
 
54
  prices = df_processed['Price'].values.reshape(-1, 1)
55
 
56
  # Define split point for scaler fitting (all data except the hold-out "future" part)
 
57
  train_scaler_fit_size = len(prices) - future_months_to_predict_on_holdout
58
  if train_scaler_fit_size < 60:
59
+ return px.line(title=f'Not enough data before hold-out period for Zip Code {zip_val} (need at least 60 months for LSTM lookback).')
 
60
  train_prices_for_scaler = prices[:train_scaler_fit_size]
 
61
  scaler = MinMaxScaler(feature_range=(0, 1))
62
+ scaler.fit(train_prices_for_scaler) # Fit scaler ONLY on the training portion
63
+ scaled_data_full = scaler.transform(prices) # Transform the entire dataset
 
64
 
65
  # Create the training sequences (from the part the scaler was fit on)
66
  train_sequences_source_data = scaled_data_full[:train_scaler_fit_size]
 
67
  x_train, y_train = [], []
68
  for i in range(60, len(train_sequences_source_data)):
69
  x_train.append(train_sequences_source_data[i-60:i, 0])
70
  y_train.append(train_sequences_source_data[i, 0])
71
+
72
+ if not x_train: # Should be caught by earlier checks, but as a safeguard
73
  return px.line(title=f'Not enough data to form training sequences for Zip Code {zip_val}.')
74
+
75
  x_train, y_train = np.array(x_train), np.array(y_train)
76
  x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
77
 
 
81
  model.add(LSTM(units=50, return_sequences=False)) # Can experiment with more layers or units
82
  model.add(Dense(units=25))
83
  model.add(Dense(units=1))
 
84
  model.compile(optimizer='adam', loss='mean_squared_error')
85
 
86
  # Train the model - **RECOMMENDATION: Increase epochs and adjust patience**
 
87
  model.fit(x_train, y_train, batch_size=1, epochs=1, # For better results, try epochs=50 or more
88
  callbacks=[EarlyStopping(monitor='loss', patience=2)], verbose=0) # verbose=0 to suppress log spam in Gradio
89
 
90
  # --- Predictions ---
 
91
  # 1. Past predictions (on the training data part for visualization of fit)
92
  past_predictions_scaled = model.predict(x_train)
93
  past_predictions_actual_scale = scaler.inverse_transform(past_predictions_scaled)
 
95
  past_pred_dates = df_processed['Date'].iloc[60 : len(train_sequences_source_data)].reset_index(drop=True)
96
 
97
  # 2. Predictions on the hold-out set ("future_months_to_predict_on_holdout")
98
+ x_test_holdout = []
 
99
  for i in range(future_months_to_predict_on_holdout):
100
  seq_start_idx = train_scaler_fit_size - 60 + i # Start of sequence relative to `prices`
101
+ seq_end_idx = train_scaler_fit_size + i # End of sequence relative to `prices`
102
+ x_test_holdout.append(scaled_data_full[seq_start_idx:seq_end_idx, 0])
103
+ x_test_holdout = np.array(x_test_holdout)
104
+ x_test_holdout = np.reshape(x_test_holdout, (x_test_holdout.shape[0], x_test_holdout.shape[1], 1))
105
+ holdout_predictions_scaled = model.predict(x_test_holdout)
 
 
106
  holdout_predictions_actual_scale = scaler.inverse_transform(holdout_predictions_scaled)
 
107
  # Dates for these hold-out predictions are the last `future_months_to_predict_on_holdout` dates
108
  holdout_pred_dates = df_processed['Date'].iloc[-future_months_to_predict_on_holdout:].reset_index(drop=True)
109
 
110
+ # 3. Future predictions (beyond the available data)
111
+ future_predictions_scaled = []
112
+ last_sequence = scaled_data_full[-60:].reshape(1, 60, 1)
113
+ for _ in range(future_months_to_project):
114
+ predicted_scaled = model.predict(last_sequence)
115
+ future_predictions_scaled.append(predicted_scaled[0, 0])
116
+ last_sequence = np.concatenate([last_sequence[:, 1:, :], predicted_scaled.reshape(1, 1, 1)], axis=1)
117
+
118
+ future_predictions_actual_scale = scaler.inverse_transform(np.array(future_predictions_scaled).reshape(-1, 1))
119
+ last_actual_date = df_processed['Date'].iloc[-1]
120
+ future_dates = pd.date_range(start=last_actual_date, periods=future_months_to_project + 1, freq='M')[1:]
121
+
122
+ future_predictions_df = pd.DataFrame({'Date': future_dates, 'Predicted Price': future_predictions_actual_scale.flatten()})
123
+
124
  # --- Plotting ---
125
  fig = px.line(df_processed, x='Date', y='Price', title=f'Housing Prices & LSTM Analysis for Zip Code {zip_val}')
126
  fig.data[0].showlegend = True
127
  fig.data[0].name = 'Actual Price'
 
128
  for window in [3, 6, 12, 24]:
129
  fig.add_scatter(x=df_processed['Date'], y=df_processed[f'{window}-Month MA'], mode='lines', name=f'{window}-Month MA')
130
+
131
  # Plot past (training set) predictions
132
  if len(past_pred_dates) == len(past_predictions_actual_scale.flatten()):
133
  fig.add_scatter(x=past_pred_dates, y=past_predictions_actual_scale.flatten(), mode='lines', line=dict(dash='dash'), name='LSTM Fit on Training Data')
 
135
  # Plot predictions on the hold-out set
136
  if len(holdout_pred_dates) == len(holdout_predictions_actual_scale.flatten()):
137
  fig.add_scatter(x=holdout_pred_dates, y=holdout_predictions_actual_scale.flatten(), mode='lines', line=dict(color='red'), name='LSTM Predictions on Hold-out Data')
138
+
139
+ # Plot future predictions
140
+ fig.add_scatter(x=future_predictions_df['Date'], y=future_predictions_df['Predicted Price'], mode='lines', line=dict(color='green'), name='LSTM Future Predictions')
 
 
 
 
141
 
142
  fig.update_layout(legend_title_text='Legend')
143
  return fig
144
 
145
  # --- Gradio Interface ---
146
+ iface = gr.Interface(
147
+ fn=plot_real_estate,
148
+ inputs=[
149
+ gr.Textbox(label="Enter ZIP Code (e.g., 90210)"),
150
+ gr.Slider(label="Months for Hold-out Prediction", minimum=6, maximum=36, value=12, step=1),
151
+ gr.Slider(label="Months to Predict into the Future", minimum=3, maximum=24, value=6, step=1)
152
+ ],
153
+ outputs=gr.Plot(),
154
+ title="Real Estate Price Analysis with LSTM Prediction",
155
+ description=lstm_explanation,
156
+ allow_flagging='never'
157
+ )
158
 
159
  if __name__ == '__main__':
160
+ iface.launch(share=False, debug=True)