Mpavan45 committed on
Commit
557481d
·
verified ·
1 Parent(s): cc23e5b

Update pages/4_EDA( Exploratory Data Analysis).py

Browse files
pages/4_EDA( Exploratory Data Analysis).py CHANGED
@@ -180,14 +180,14 @@ if data is not None:
180
  **Summary:**
181
  Hotels tend to prioritize basic amenities like parking, toiletries, and hair dryers, while luxurious amenities are offered less frequently.
182
  """)
183
-
184
  # Streamlit app
185
  st.title("Bivariate Analysis")
186
 
187
  # Price vs Rating scatter plot
188
  st.subheader("Price vs Rating")
189
  fig, ax = plt.subplots(figsize=(7, 5))
190
- sns.scatterplot(x='rating', y='price', data=agoda_df, color='orange')
191
  ax.set_title('Price vs Rating')
192
  ax.set_xlabel('Rating')
193
  ax.set_ylabel('Price')
@@ -205,7 +205,7 @@ if data is not None:
205
  # Price vs Discount scatter plot
206
  st.subheader("Price vs Discount")
207
  fig, ax = plt.subplots(figsize=(7, 5))
208
- sns.scatterplot(x='discount', y='price', data=agoda_df, color='green')
209
  ax.set_title('Price vs Discount')
210
  ax.set_xlabel('Discount')
211
  ax.set_ylabel('Price')
@@ -214,7 +214,7 @@ if data is not None:
214
  # Price vs Cashback scatter plot
215
  st.subheader("Price vs Cashback")
216
  fig, ax = plt.subplots(figsize=(7, 5))
217
- sns.scatterplot(x='cashback', y='price', data=agoda_df, color='blue')
218
  ax.set_title('Price vs Cashback')
219
  ax.set_xlabel('Cashback')
220
  ax.set_ylabel('Price')
@@ -223,7 +223,7 @@ if data is not None:
223
  # Price vs Category bar plot
224
  st.subheader("Price vs Category")
225
  fig, ax = plt.subplots(figsize=(7, 5))
226
- sns.barplot(x='category', y='price', data=agoda_df, palette='Set2')
227
  ax.set_title('Price vs Category')
228
  ax.set_xlabel('Category')
229
  ax.set_ylabel('Price')
@@ -232,7 +232,7 @@ if data is not None:
232
  # Rating vs Category bar plot
233
  st.subheader("Rating vs Category")
234
  fig, ax = plt.subplots(figsize=(7, 5))
235
- sns.barplot(x='category', y='rating', data=agoda_df, palette='Set1')
236
  ax.set_title('Rating vs Category')
237
  ax.set_xlabel('Category')
238
  ax.set_ylabel('Rating')
@@ -241,7 +241,7 @@ if data is not None:
241
  # Discount vs Category box plot
242
  st.subheader("Discount vs Category")
243
  fig, ax = plt.subplots(figsize=(7, 5))
244
- sns.boxplot(x='category', y='discount', data=agoda_df, palette='Set2')
245
  ax.set_title('Discount vs Category')
246
  ax.set_xlabel('Category')
247
  ax.set_ylabel('Discount')
@@ -250,7 +250,7 @@ if data is not None:
250
  # Cashback vs Category violin plot
251
  st.subheader("Cashback vs Category")
252
  fig, ax = plt.subplots(figsize=(7, 5))
253
- sns.violinplot(x='category', y='cashback', data=agoda_df, palette='Set3')
254
  ax.set_title('Cashback vs Category')
255
  ax.set_xlabel('Category')
256
  ax.set_ylabel('Cashback')
@@ -259,7 +259,7 @@ if data is not None:
259
  # Category count plot: number of rows per category (the 'reviews' column is not used here)
260
  st.subheader("Reviews vs Category")
261
  fig, ax = plt.subplots(figsize=(7, 5))
262
- sns.countplot(x='category', data=agoda_df, palette='Set1')
263
  ax.set_title('Reviews vs Category (Count)')
264
  ax.set_xlabel('Category')
265
  ax.set_ylabel('Count of Reviews')
@@ -268,7 +268,7 @@ if data is not None:
268
  # Regional price analysis by state
269
  st.subheader("Price by State")
270
  fig, ax = plt.subplots(figsize=(16, 6))
271
- sns.barplot(data=agoda_df, x='state', y='price', ax=ax, color='green')
272
  ax.set_title('Price by State')
273
  ax.tick_params(axis='x', rotation=90)
274
  sns.set_palette('magma')
@@ -278,7 +278,7 @@ if data is not None:
278
  # Regional category count by state
279
  st.subheader("Category by State")
280
  fig, ax = plt.subplots(figsize=(16, 6))
281
- sns.countplot(data=agoda_df, x='state', hue='category', ax=ax, palette='Set1')
282
  ax.set_title('Category by State')
283
  ax.tick_params(axis='x', rotation=90)
284
  plt.tight_layout()
@@ -290,23 +290,22 @@ if data is not None:
290
  - **Hotel Categories by State:** States with more "Low Budget" and "Budget" hotels cater to cost-conscious travelers. States with more "Luxury" hotels are likely tourist hubs or cater to premium audiences.
291
  - **Summary:** Regional trends indicate diverse pricing and category distributions, influenced by tourism and economic conditions in different states.
292
  """)
293
-
294
-
295
  st.title("Multivariate Analysis of Hotel Data")
296
 
297
  # Create a subset of the data for the analysis
298
- subset_data = agoda_df[['category', 'price', 'reviews', 'discount', 'cashback', 'rating']]
299
 
300
  # Section 1: Price vs. Reviews by Category
301
  st.header("Price vs. Reviews by Category")
302
- fig1 = sns.catplot(data=agoda_df, x='reviews', y='price', hue='category', kind='strip', palette='Set2', height=6, aspect=1.5)
303
  fig1.set_axis_labels("Reviews", "Price")
304
  fig1.fig.suptitle('Price vs Reviews by Category', fontsize=16)
305
  st.pyplot(fig1)
306
 
307
  # Section 2: Price vs. Discount by Category
308
  st.header("Price vs. Discount by Category")
309
- fig2 = sns.catplot(data=agoda_df, x='discount', y='price', hue='category', kind='bar', palette='Set2', height=6, aspect=1.5)
310
  fig2.set_axis_labels("Discount", "Price")
311
  fig2.fig.suptitle('Price vs Discount by Category', fontsize=16)
312
  st.pyplot(fig2)
@@ -315,14 +314,14 @@ if data is not None:
315
  st.header("Price vs Cashback and Rating by Category")
316
  fig3, axes2 = plt.subplots(1, 2, figsize=(16, 6))
317
 
318
- sns.stripplot(data=agoda_df, x='cashback', y='price', hue='category', ax=axes2[0], palette='Set2', jitter=True, dodge=True)
319
  axes2[0].set_title('Price vs Cashback by Category')
320
 
321
- sns.stripplot(data=agoda_df, x='rating', y='price', hue='category', ax=axes2[1], palette='Set2', jitter=True, dodge=True)
322
  axes2[1].set_title('Price vs Rating by Category')
323
 
324
  st.pyplot(fig3)
325
-
326
  # Insights and analysis
327
  st.header("Plot-Wise Analysis Insights")
328
 
@@ -350,7 +349,7 @@ if data is not None:
350
  numeric_columns = ['price', 'reviews', 'discount', 'cashback', 'rating']
351
 
352
  # Compute the correlation matrix
353
- correlation_matrix = agoda_df[numeric_columns].corr()
354
 
355
  # Create a heatmap to visualize the correlation matrix
356
  plt.figure(figsize=(10, 8))
@@ -383,6 +382,10 @@ if data is not None:
383
  - These trends represent the expected distribution of data where higher values are less frequent but are not considered outliers.
384
  - The variations in cancellation patterns and review counts reflect typical customer behavior and industry dynamics.
385
  """)
 
 
 
 
386
 
387
 
388
  else:
 
180
  **Summary:**
181
  Hotels tend to prioritize basic amenities like parking, toiletries, and hair dryers, while luxurious amenities are offered less frequently.
182
  """)
183
+
184
  # Streamlit app
185
  st.title("Bivariate Analysis")
186
 
187
  # Price vs Rating scatter plot
188
  st.subheader("Price vs Rating")
189
  fig, ax = plt.subplots(figsize=(7, 5))
190
+ sns.scatterplot(x='rating', y='price', data=data, color='orange')
191
  ax.set_title('Price vs Rating')
192
  ax.set_xlabel('Rating')
193
  ax.set_ylabel('Price')
 
205
  # Price vs Discount scatter plot
206
  st.subheader("Price vs Discount")
207
  fig, ax = plt.subplots(figsize=(7, 5))
208
+ sns.scatterplot(x='discount', y='price', data=data, color='green')
209
  ax.set_title('Price vs Discount')
210
  ax.set_xlabel('Discount')
211
  ax.set_ylabel('Price')
 
214
  # Price vs Cashback scatter plot
215
  st.subheader("Price vs Cashback")
216
  fig, ax = plt.subplots(figsize=(7, 5))
217
+ sns.scatterplot(x='cashback', y='price', data=data, color='blue')
218
  ax.set_title('Price vs Cashback')
219
  ax.set_xlabel('Cashback')
220
  ax.set_ylabel('Price')
 
223
  # Price vs Category bar plot
224
  st.subheader("Price vs Category")
225
  fig, ax = plt.subplots(figsize=(7, 5))
226
+ sns.barplot(x='category', y='price', data=data, palette='Set2')
227
  ax.set_title('Price vs Category')
228
  ax.set_xlabel('Category')
229
  ax.set_ylabel('Price')
 
232
  # Rating vs Category bar plot
233
  st.subheader("Rating vs Category")
234
  fig, ax = plt.subplots(figsize=(7, 5))
235
+ sns.barplot(x='category', y='rating', data=data, palette='Set1')
236
  ax.set_title('Rating vs Category')
237
  ax.set_xlabel('Category')
238
  ax.set_ylabel('Rating')
 
241
  # Discount vs Category box plot
242
  st.subheader("Discount vs Category")
243
  fig, ax = plt.subplots(figsize=(7, 5))
244
+ sns.boxplot(x='category', y='discount', data=data, palette='Set2')
245
  ax.set_title('Discount vs Category')
246
  ax.set_xlabel('Category')
247
  ax.set_ylabel('Discount')
 
250
  # Cashback vs Category violin plot
251
  st.subheader("Cashback vs Category")
252
  fig, ax = plt.subplots(figsize=(7, 5))
253
+ sns.violinplot(x='category', y='cashback', data=data, palette='Set3')
254
  ax.set_title('Cashback vs Category')
255
  ax.set_xlabel('Category')
256
  ax.set_ylabel('Cashback')
 
259
  # Category count plot: number of rows per category (the 'reviews' column is not used here)
260
  st.subheader("Reviews vs Category")
261
  fig, ax = plt.subplots(figsize=(7, 5))
262
+ sns.countplot(x='category', data=data, palette='Set1')
263
  ax.set_title('Reviews vs Category (Count)')
264
  ax.set_xlabel('Category')
265
  ax.set_ylabel('Count of Reviews')
 
268
  # Regional price analysis by state
269
  st.subheader("Price by State")
270
  fig, ax = plt.subplots(figsize=(16, 6))
271
+ sns.barplot(data=data, x='state', y='price', ax=ax, color='green')
272
  ax.set_title('Price by State')
273
  ax.tick_params(axis='x', rotation=90)
274
  sns.set_palette('magma')
 
278
  # Regional category count by state
279
  st.subheader("Category by State")
280
  fig, ax = plt.subplots(figsize=(16, 6))
281
+ sns.countplot(data=data, x='state', hue='category', ax=ax, palette='Set1')
282
  ax.set_title('Category by State')
283
  ax.tick_params(axis='x', rotation=90)
284
  plt.tight_layout()
 
290
  - **Hotel Categories by State:** States with more "Low Budget" and "Budget" hotels cater to cost-conscious travelers. States with more "Luxury" hotels are likely tourist hubs or cater to premium audiences.
291
  - **Summary:** Regional trends indicate diverse pricing and category distributions, influenced by tourism and economic conditions in different states.
292
  """)
293
+
 
294
  st.title("Multivariate Analysis of Hotel Data")
295
 
296
  # Create a subset of the data for the analysis
297
+ subset_data = data[['category', 'price', 'reviews', 'discount', 'cashback', 'rating']]
298
 
299
  # Section 1: Price vs. Reviews by Category
300
  st.header("Price vs. Reviews by Category")
301
+ fig1 = sns.catplot(data=data, x='reviews', y='price', hue='category', kind='strip', palette='Set2', height=6, aspect=1.5)
302
  fig1.set_axis_labels("Reviews", "Price")
303
  fig1.fig.suptitle('Price vs Reviews by Category', fontsize=16)
304
  st.pyplot(fig1)
305
 
306
  # Section 2: Price vs. Discount by Category
307
  st.header("Price vs. Discount by Category")
308
+ fig2 = sns.catplot(data=data, x='discount', y='price', hue='category', kind='bar', palette='Set2', height=6, aspect=1.5)
309
  fig2.set_axis_labels("Discount", "Price")
310
  fig2.fig.suptitle('Price vs Discount by Category', fontsize=16)
311
  st.pyplot(fig2)
 
314
  st.header("Price vs Cashback and Rating by Category")
315
  fig3, axes2 = plt.subplots(1, 2, figsize=(16, 6))
316
 
317
+ sns.stripplot(data=data, x='cashback', y='price', hue='category', ax=axes2[0], palette='Set2', jitter=True, dodge=True)
318
  axes2[0].set_title('Price vs Cashback by Category')
319
 
320
+ sns.stripplot(data=data, x='rating', y='price', hue='category', ax=axes2[1], palette='Set2', jitter=True, dodge=True)
321
  axes2[1].set_title('Price vs Rating by Category')
322
 
323
  st.pyplot(fig3)
324
+
325
  # Insights and analysis
326
  st.header("Plot-Wise Analysis Insights")
327
 
 
349
  numeric_columns = ['price', 'reviews', 'discount', 'cashback', 'rating']
350
 
351
  # Compute the correlation matrix
352
+ correlation_matrix = data[numeric_columns].corr()
353
 
354
  # Create a heatmap to visualize the correlation matrix
355
  plt.figure(figsize=(10, 8))
 
382
  - These trends represent the expected distribution of data where higher values are less frequent but are not considered outliers.
383
  - The variations in cancellation patterns and review counts reflect typical customer behavior and industry dynamics.
384
  """)
385
+ st.write("""
386
+ Since no outliers were detected, we can proceed with model training and selection.
387
+ With clean data, we can now focus on choosing the best algorithm, tuning hyperparameters, and evaluating model performance.
388
+ """)
389
 
390
 
391
  else: