dibend committed on
Commit
5797f05
·
verified ·
1 Parent(s): d106ab7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +111 -66
app.py CHANGED
@@ -10,104 +10,149 @@ from tensorflow.keras.callbacks import EarlyStopping
10
  lstm_explanation = """
11
  ## Understanding LSTM in This App
12
 
13
- **What is LSTM?**
14
- LSTM (Long Short-Term Memory) is a type of neural network designed for time-series data, like housing prices. It excels at capturing patterns in sequential data, making it ideal for predicting future values based on historical trends.
15
 
16
- **How is it used here?**
17
- - The LSTM model uses housing price data since January 2000 for the selected ZIP code.
18
  - It takes a 60-month lookback window (5 years) of historical prices to predict the next month's price.
19
- - The model learns trends, such as seasonal changes or long-term growth, and generates predictions for future months, shown as "Future Predictions" in the chart.
 
 
20
  """
21
 
22
- def plot_real_estate(zip, future_months=12):
 
 
 
 
 
23
  # Read the CSV file
24
- df = pd.read_csv('https://files.zillowstatic.com/research/public_csvs/zhvi/Zip_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv')
25
 
26
- # Extract the data for zip code
27
- df = df[df['RegionName'] == int(zip)]
28
 
29
- # Select the columns with dates
30
- df = df.loc[:, '2000-01-31':]
31
 
32
- # Transpose the data
33
- df = df.T.reset_index()
34
- df.columns = ['Date', 'Price']
 
 
 
35
 
36
- # Convert 'Date' to datetime
37
- df['Date'] = pd.to_datetime(df['Date'])
38
 
39
  # Compute the moving averages
40
- df['3-Month MA'] = df['Price'].rolling(3).mean()
41
- df['6-Month MA'] = df['Price'].rolling(6).mean()
42
- df['12-Month MA'] = df['Price'].rolling(12).mean()
43
- df['24-Month MA'] = df['Price'].rolling(24).mean()
44
-
45
- # Plot the price data and moving averages
46
- fig = px.line(df, x='Date', y='Price', title=f'Housing Prices for Zip Code {zip}')
47
- fig['data'][0]['showlegend'] = True
48
- fig['data'][0]['name'] = 'Price'
49
-
50
- fig.add_scatter(x=df['Date'], y=df['3-Month MA'], mode='lines', name='3-Month MA')
51
- fig.add_scatter(x=df['Date'], y=df['6-Month MA'], mode='lines', name='6-Month MA')
52
- fig.add_scatter(x=df['Date'], y=df['12-Month MA'], mode='lines', name='12-Month MA')
53
- fig.add_scatter(x=df['Date'], y=df['24-Month MA'], mode='lines', name='24-Month MA')
54
-
55
- # Prepare data for LSTM
56
  scaler = MinMaxScaler(feature_range=(0, 1))
57
- scaled_data = scaler.fit_transform(df['Price'].values.reshape(-1, 1))
 
 
58
 
59
- # Create the training data set
60
- train_data = scaled_data[:-future_months]
 
61
  x_train, y_train = [], []
62
- for i in range(60, len(train_data)):
63
- x_train.append(train_data[i-60:i, 0])
64
- y_train.append(train_data[i, 0])
 
 
 
 
65
  x_train, y_train = np.array(x_train), np.array(y_train)
66
  x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
67
 
68
  # Build the LSTM model
69
  model = Sequential()
70
  model.add(LSTM(units=50, return_sequences=True, input_shape=(x_train.shape[1], 1)))
71
- model.add(LSTM(units=50, return_sequences=False))
72
  model.add(Dense(units=25))
73
  model.add(Dense(units=1))
74
 
75
- # Compile the model
76
  model.compile(optimizer='adam', loss='mean_squared_error')
77
 
78
- # Train the model
79
- model.fit(x_train, y_train, batch_size=1, epochs=1, callbacks=[EarlyStopping(monitor='loss', patience=2)])
 
 
 
 
 
 
 
 
 
 
80
 
81
- # Test the model
82
- test_data = scaled_data[-(60+future_months):]
83
  x_test = []
84
- for i in range(60, len(test_data)):
85
- x_test.append(test_data[i-60:i, 0])
 
 
 
 
86
  x_test = np.array(x_test)
87
  x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))
88
 
89
- predictions = model.predict(x_test)
90
- predictions = scaler.inverse_transform(predictions)
91
-
92
- # Past predictions for visualization
93
- past_predictions = model.predict(x_train)
94
- past_predictions = scaler.inverse_transform(past_predictions)
95
- past_dates = df['Date'].iloc[60:len(train_data)].reset_index(drop=True)
96
-
97
- future_dates = pd.date_range(start=df['Date'].iloc[-1], periods=future_months+1, freq='M')[1:]
98
- future_df = pd.DataFrame({'Date': future_dates, 'Predicted Price': predictions.flatten()})
99
-
100
- # Plot past predictions
101
- fig.add_scatter(x=past_dates, y=past_predictions.flatten(), mode='lines', line=dict(dash='dash'), name='Past Predictions')
102
-
103
- # Plot future predictions
104
- fig.add_scatter(x=future_df['Date'], y=future_df['Predicted Price'], mode='lines', name='Future Predictions')
105
-
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  return fig
107
 
 
108
  iface = gr.Interface(fn=plot_real_estate,
109
- inputs=[gr.Textbox(label="Zip Code"), gr.Slider(label="Months to Predict", minimum=1, maximum=24, step=1)],
 
 
 
110
  outputs=gr.Plot(),
111
- description=lstm_explanation)
 
 
112
 
113
- iface.launch(share=False, debug=True)
 
 
10
# Markdown passed to gr.Interface as the app description. The bullet list is
# separated from its heading by a blank line and every bullet starts its own
# line, so Markdown renders it as a list instead of inline paragraph text.
lstm_explanation = """
## Understanding LSTM in This App

**What is LSTM?** LSTM (Long Short-Term Memory) is a type of neural network designed for time-series data, like housing prices. It excels at capturing patterns in sequential data, making it ideal for predicting future values based on historical trends.

**How is it used here?**

- The LSTM model uses housing price data since January 2000 for the selected ZIP code.
- It takes a 60-month lookback window (5 years) of historical prices to predict the next month's price.
- The model learns trends, such as seasonal changes or long-term growth.
- 'LSTM Fit on Training Data' shows how well the model learned the patterns in the historical data it was trained on.
- 'LSTM Predictions on Hold-out Data' shows the model's predictions for a recent period of actual prices that it wasn't trained on, to evaluate its forecasting ability.
"""

def _lookback_windows(scaled, lookback, first_target, stop_target):
    """Build LSTM input windows of shape (n, lookback, 1).

    Row ``k`` holds the ``lookback`` values of column 0 of ``scaled`` that
    immediately precede target index ``first_target + k``. The caller must
    ensure ``first_target < stop_target`` and ``first_target >= lookback``.
    """
    windows = [scaled[t - lookback:t, 0] for t in range(first_target, stop_target)]
    return np.stack(windows)[..., np.newaxis]


def plot_real_estate(zip_code_str, future_months_to_predict_on_holdout=12):
    """Plot Zillow home values for a ZIP code with an LSTM fit and hold-out forecast.

    The most recent ``future_months_to_predict_on_holdout`` months are held
    out. The scaler and the LSTM are fit only on the months before them, and
    the model then makes one-step-ahead predictions for each held-out month
    from the 60 actual months preceding it.

    Args:
        zip_code_str: ZIP code as text (anything ``int()`` accepts).
        future_months_to_predict_on_holdout: Number of trailing months to hold
            out and predict. Coerced to ``int``; must be at least 1.

    Returns:
        A Plotly figure. When the input is invalid or there is too little
        data, an empty figure whose title carries the explanation is returned
        instead of raising.
    """
    lookback = 60  # months of history fed to the LSTM for each prediction

    try:
        zip_val = int(zip_code_str)
    except (TypeError, ValueError):  # TypeError covers None from a direct caller
        return px.line(title=f"Invalid ZIP Code: '{zip_code_str}'. Please enter a numeric ZIP code.")

    # The slider value may arrive as a float depending on the Gradio version;
    # range() and negative slicing below require an int.
    holdout = int(future_months_to_predict_on_holdout)
    if holdout < 1:
        # iloc[-0:] would select every row, so a zero hold-out must be rejected.
        return px.line(title='Months for hold-out prediction must be at least 1.')

    # Read the CSV file
    df_full = pd.read_csv('https://files.zillowstatic.com/research/public_csvs/zhvi/Zip_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv')

    # Extract the data for the given zip code
    df_zip_subset = df_full[df_full['RegionName'] == zip_val]
    if df_zip_subset.empty:
        return px.line(title=f'No data found for Zip Code {zip_val}')

    # Keep only the date columns and turn them into (Date, Price) rows
    df_processed = df_zip_subset.loc[:, '2000-01-31':].T.reset_index()
    df_processed.columns = ['Date', 'Price']
    df_processed['Date'] = pd.to_datetime(df_processed['Date'])
    df_processed = df_processed.dropna(subset=['Price']).reset_index(drop=True)

    # One extra month beyond lookback + hold-out is required: with exactly
    # `lookback` training rows there is no target left to train on.
    min_rows = lookback + holdout + 1
    if len(df_processed) < min_rows:
        return px.line(title=f'Not enough historical data for Zip Code {zip_val} (need at least {min_rows} months of data).')

    # Compute the moving averages
    ma_windows = [3, 6, 12, 24]
    for window in ma_windows:
        df_processed[f'{window}-Month MA'] = df_processed['Price'].rolling(window).mean()

    # --- Prepare data for LSTM ---
    prices = df_processed['Price'].values.reshape(-1, 1)
    train_size = len(prices) - holdout

    # Fit the scaler ONLY on the training portion so the hold-out months do
    # not leak into the scaling, then transform the whole series.
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler.fit(prices[:train_size])
    scaled = scaler.transform(prices)

    x_train = _lookback_windows(scaled, lookback, lookback, train_size)
    y_train = scaled[lookback:train_size, 0]

    # Build the LSTM model
    model = Sequential()
    model.add(LSTM(units=50, return_sequences=True, input_shape=(x_train.shape[1], 1)))
    model.add(LSTM(units=50, return_sequences=False))
    model.add(Dense(units=25))
    model.add(Dense(units=1))
    model.compile(optimizer='adam', loss='mean_squared_error')

    # Training settings are kept as in the original. NOTE: with epochs=1 the
    # EarlyStopping callback can never trigger; raise epochs for better fits.
    model.fit(x_train, y_train, batch_size=1, epochs=1,
              callbacks=[EarlyStopping(monitor='loss', patience=2)], verbose=0)

    # --- Predictions ---
    # 1. Fit on the training portion; targets are rows lookback..train_size-1.
    train_fit = scaler.inverse_transform(model.predict(x_train, verbose=0)).flatten()
    train_fit_dates = df_processed['Date'].iloc[lookback:train_size]

    # 2. One-step-ahead predictions for the held-out rows train_size..end.
    x_holdout = _lookback_windows(scaled, lookback, train_size, len(scaled))
    holdout_pred = scaler.inverse_transform(model.predict(x_holdout, verbose=0)).flatten()
    holdout_dates = df_processed['Date'].iloc[train_size:]

    # --- Plotting ---
    fig = px.line(df_processed, x='Date', y='Price', title=f'Housing Prices & LSTM Analysis for Zip Code {zip_val}')
    fig.data[0].showlegend = True
    fig.data[0].name = 'Actual Price'

    for window in ma_windows:
        fig.add_scatter(x=df_processed['Date'], y=df_processed[f'{window}-Month MA'], mode='lines', name=f'{window}-Month MA')

    # Dates and predictions have equal length by construction (same index
    # ranges as the windows above), so both traces are always drawn.
    fig.add_scatter(x=train_fit_dates, y=train_fit, mode='lines', line=dict(dash='dash'), name='LSTM Fit on Training Data')
    fig.add_scatter(x=holdout_dates, y=holdout_pred, mode='lines', line=dict(color='red'), name='LSTM Predictions on Hold-out Data')

    # NOTE: hold-out predictions are plotted against their actual dates.
    # Projecting beyond the last available month would need an iterative
    # loop that feeds predictions back in, which is not done here.
    fig.update_layout(legend_title_text='Legend')
    return fig

# --- Gradio Interface ---
# Input widgets, in the positional order plot_real_estate expects them.
_zip_code_box = gr.Textbox(label="Enter ZIP Code (e.g., 90210)")
_holdout_slider = gr.Slider(
    label="Months for Hold-out Prediction",
    minimum=6,
    maximum=36,
    value=12,
    step=1,
)

iface = gr.Interface(
    fn=plot_real_estate,
    inputs=[_zip_code_box, _holdout_slider],
    outputs=gr.Plot(),
    title="Real Estate Price Analysis with LSTM Prediction",
    description=lstm_explanation,
    allow_flagging='never',
)

# Launch only when run as a script; pass share=True for a public link.
if __name__ == '__main__':
    iface.launch(share=False, debug=True)