Mpavan45 committed on
Commit
1a93c1f
·
verified ·
1 Parent(s): f600636

Update pages/4_EDA( Exploratory Data Analysis).py

Browse files
pages/4_EDA( Exploratory Data Analysis).py CHANGED
@@ -16,13 +16,14 @@ if data is not None:
16
  st.subheader("Dataset Preview:")
17
  st.write(data) # Show the DataFrame as passed in (no .head() is applied here)
18
 
19
- st.subheader('DataFrame Info:')
 
20
  # Redirect the output of df.info() to a string buffer
21
  buffer = StringIO()
22
  data.info(buf=buffer)
23
 
24
  # Display the content in Streamlit
25
- st.text(buffer.getvalue())
26
 
27
  st.subheader("Dataset Statistics:")
28
  st.write(data.describe())
@@ -47,6 +48,14 @@ if data is not None:
47
 
48
  st.pyplot(fig)
49
 
 
 
 
 
 
 
 
 
50
  # Price, Cashback, and Discount Distribution
51
  st.subheader("Price, Cashback, and Discount Distribution")
52
  fig, axs = plt.subplots(1, 3, figsize=(16, 6))
@@ -66,6 +75,24 @@ if data is not None:
66
  axs[2].set_xlabel('Discount')
67
 
68
  st.pyplot(fig)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
 
70
  # Cancellation and State Distribution
71
  st.subheader("Cancellation and State Distribution")
@@ -82,6 +109,20 @@ if data is not None:
82
  axs[1].set_ylabel('Number of Hotels')
83
 
84
  st.pyplot(fig)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
86
  # Category and Reviews Distribution
87
  st.subheader("Category and Reviews Distribution")
@@ -99,6 +140,19 @@ if data is not None:
99
  axs[1].set_ylabel('Number of Reviews')
100
 
101
  st.pyplot(fig)
 
 
 
 
 
 
 
 
 
 
 
 
 
102
 
103
  # Top 10 Amenities
104
  st.subheader("Top 10 Amenities")
@@ -112,6 +166,224 @@ if data is not None:
112
  ax.set_ylabel('Amenity')
113
 
114
  st.pyplot(fig)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
  else:
117
  st.warning("No dataset found in session state. Please load the dataset into `st.session_state['data']`.")
 
16
  st.subheader("Dataset Preview:")
17
  st.write(data) # Show the DataFrame as passed in (no .head() is applied here)
18
 
19
+
20
+ st.subheader("Info of the Dataset:")
21
  # Redirect the output of df.info() to a string buffer
22
  buffer = StringIO()
23
  data.info(buf=buffer)
24
 
25
  # Display the content in Streamlit
26
+ st.write(buffer.getvalue())
27
 
28
  st.subheader("Dataset Statistics:")
29
  st.write(data.describe())
 
48
 
49
  st.pyplot(fig)
50
 
51
+ # Hotel Star Insights
52
+ st.write("""
53
+ **Insight:**
54
+ - Majority of hotels in this data are 3-star hotels.
55
+ - Frequency of 4-star and 5-star hotels are also moderately good.
56
+ - 1-star and 2-star hotels are lower in frequency.
57
+ """)
58
+
59
  # Price, Cashback, and Discount Distribution
60
  st.subheader("Price, Cashback, and Discount Distribution")
61
  fig, axs = plt.subplots(1, 3, figsize=(16, 6))
 
75
  axs[2].set_xlabel('Discount')
76
 
77
  st.pyplot(fig)
78
+ # Histogram Insights
79
+ st.subheader("Plot-wise Analysis of Histograms")
80
+ st.write("""
81
+ **Price Distribution Insight:**
82
+ - The histogram is right-skewed, showing most properties are in the lower price range.
83
+ - A long tail indicates the presence of a few very expensive properties.
84
+
85
+ **Cashback Distribution Insight:**
86
+ - The histogram is right-skewed, with the majority of properties offering lower cashback amounts.
87
+ - Only a small number of properties provide higher cashback.
88
+
89
+ **Discount Distribution Insight:**
90
+ - The histogram is right-skewed, indicating that most properties offer lower discount percentages.
91
+ - A few properties stand out with higher discounts.
92
+
93
+ **Summary:**
94
+ The data suggest that Agoda properties are generally affordable, with lower cashback and discount offers being common. Further statistical analysis could help uncover more detailed insights.
95
+ """)
96
 
97
  # Cancellation and State Distribution
98
  st.subheader("Cancellation and State Distribution")
 
109
  axs[1].set_ylabel('Number of Hotels')
110
 
111
  st.pyplot(fig)
112
+ # Bar Chart Insights
113
+ st.subheader("Plot Wise Analysis of Bar Charts")
114
+ st.write("""
115
+ **Cancellations:**
116
+ - Most cancellations fall under category "1," indicating they occur within specific conditions or timeframes.
117
+
118
+ **State Distribution:**
119
+ - "Maharashtra" has the highest number of hotels, followed by "Madhya Pradesh."
120
+ - Other states like Gujarat, Karnataka, and Kerala also have notable hotel counts.
121
+ - The distribution is uneven, with some states having significantly more hotels.
122
+
123
+ **Summary:**
124
+ The charts highlight cancellation trends and the regional hotel distribution in India.
125
+ """)
126
 
127
  # Category and Reviews Distribution
128
  st.subheader("Category and Reviews Distribution")
 
140
  axs[1].set_ylabel('Number of Reviews')
141
 
142
  st.pyplot(fig)
143
+ # Hotel Categories and Reviews Insights
144
+ st.subheader("Plot Wise Analysis of Hotel Categories and Reviews")
145
+ st.write("""
146
+ **Category Distribution:**
147
+ - The histogram shows "Low Budget" hotels are the most common, followed by "Budget Hotels," while "Luxury Hotels" are the least common.
148
+
149
+ **Review Count Distribution:**
150
+ - The histogram is right-skewed, with most hotels having a low number of reviews.
151
+ - A few hotels have a very high number of reviews, evident from the long tail.
152
+
153
+ **Summary:**
154
+ The data indicates a higher concentration of low-budget hotels and relatively low review counts for most hotels.
155
+ """)
156
 
157
  # Top 10 Amenities
158
  st.subheader("Top 10 Amenities")
 
166
  ax.set_ylabel('Amenity')
167
 
168
  st.pyplot(fig)
169
+ # Top Amenities Insights
170
+ st.subheader("Plot Wise Analysis of Top Amenities")
171
+ st.write("""
172
+ **Common Amenities:**
173
+ - Complimentary Parking is the most frequently offered amenity.
174
+ - Basic Toiletries and Hair Dryers are also widely available.
175
+
176
+ **Less Common Amenities:**
177
+ - Fitness Center Access, Welcome Drinks, and Turndown Service are less common.
178
+ - Shoe Shine Service is the least frequently offered amenity.
179
+
180
+ **Summary:**
181
+ Hotels tend to prioritize basic amenities like parking, toiletries, and hair dryers, while luxurious amenities are offered less frequently.
182
+ """)
183
+
184
+ # Bivariate analysis section
185
+ st.title("Bivariate Analysis")
186
+
187
+ # Price vs Rating scatter plot
188
+ st.subheader("Price vs Rating")
189
+ fig, ax = plt.subplots(figsize=(7, 5))
190
+ sns.scatterplot(x='rating', y='price', data=agoda_df, color='orange')
191
+ ax.set_title('Price vs Rating')
192
+ ax.set_xlabel('Rating')
193
+ ax.set_ylabel('Price')
194
+ st.pyplot(fig)
195
+
196
+ st.write("""
197
+ **Plot-Wise Analysis of Scatter Plots:**
198
+ - **Price vs. Rating:** Higher-priced hotels slightly tend to have better ratings, but ratings vary widely across price points.
199
+ - **Price vs. Discount:** Some high-priced hotels still provide discounts due to promotions or special deals.
200
+ - **Price vs. Cashback:** Exceptions exist due to promotional campaigns.
201
+ - **Price vs. Category:** "Luxury" hotels have the highest prices, followed by "Premium" and "Free & Easy." "Low Budget" and "Budget" hotels occupy the lower price range.
202
+ - **Summary:** Scatter plots reveal that higher-priced hotels generally offer better ratings but fewer discounts and cashback incentives. Lower-priced categories compensate with more promotional benefits.
203
+ """)
204
+
205
+ # Price vs Discount scatter plot
206
+ st.subheader("Price vs Discount")
207
+ fig, ax = plt.subplots(figsize=(7, 5))
208
+ sns.scatterplot(x='discount', y='price', data=agoda_df, color='green')
209
+ ax.set_title('Price vs Discount')
210
+ ax.set_xlabel('Discount')
211
+ ax.set_ylabel('Price')
212
+ st.pyplot(fig)
213
+
214
+ # Price vs Cashback scatter plot
215
+ st.subheader("Price vs Cashback")
216
+ fig, ax = plt.subplots(figsize=(7, 5))
217
+ sns.scatterplot(x='cashback', y='price', data=agoda_df, color='blue')
218
+ ax.set_title('Price vs Cashback')
219
+ ax.set_xlabel('Cashback')
220
+ ax.set_ylabel('Price')
221
+ st.pyplot(fig)
222
+
223
+ # Price vs Category bar plot
224
+ st.subheader("Price vs Category")
225
+ fig, ax = plt.subplots(figsize=(7, 5))
226
+ sns.barplot(x='category', y='price', data=agoda_df, palette='Set2')
227
+ ax.set_title('Price vs Category')
228
+ ax.set_xlabel('Category')
229
+ ax.set_ylabel('Price')
230
+ st.pyplot(fig)
231
+
232
+ # Rating vs Category bar plot
233
+ st.subheader("Rating vs Category")
234
+ fig, ax = plt.subplots(figsize=(7, 5))
235
+ sns.barplot(x='category', y='rating', data=agoda_df, palette='Set1')
236
+ ax.set_title('Rating vs Category')
237
+ ax.set_xlabel('Category')
238
+ ax.set_ylabel('Rating')
239
+ st.pyplot(fig)
240
+
241
+ # Discount vs Category box plot
242
+ st.subheader("Discount vs Category")
243
+ fig, ax = plt.subplots(figsize=(7, 5))
244
+ sns.boxplot(x='category', y='discount', data=agoda_df, palette='Set2')
245
+ ax.set_title('Discount vs Category')
246
+ ax.set_xlabel('Category')
247
+ ax.set_ylabel('Discount')
248
+ st.pyplot(fig)
249
+
250
+ # Cashback vs Category violin plot
251
+ st.subheader("Cashback vs Category")
252
+ fig, ax = plt.subplots(figsize=(7, 5))
253
+ sns.violinplot(x='category', y='cashback', data=agoda_df, palette='Set3')
254
+ ax.set_title('Cashback vs Category')
255
+ ax.set_xlabel('Category')
256
+ ax.set_ylabel('Cashback')
257
+ st.pyplot(fig)
258
+
259
+ # Category count plot: countplot tallies rows per category; the 'reviews' column is not used
260
+ st.subheader("Reviews vs Category")
261
+ fig, ax = plt.subplots(figsize=(7, 5))
262
+ sns.countplot(x='category', data=agoda_df, palette='Set1')
263
+ ax.set_title('Reviews vs Category (Count)')
264
+ ax.set_xlabel('Category')
265
+ ax.set_ylabel('Count of Reviews')
266
+ st.pyplot(fig)
267
+
268
+ # Regional price analysis by state
269
+ st.subheader("Price by State")
270
+ fig, ax = plt.subplots(figsize=(16, 6))
271
+ sns.barplot(data=agoda_df, x='state', y='price', ax=ax, color='green')
272
+ ax.set_title('Price by State')
273
+ ax.tick_params(axis='x', rotation=90)
274
+ sns.set_palette('magma')
275
+ plt.tight_layout()
276
+ st.pyplot(fig)
277
+
278
+ # Regional category count by state
279
+ st.subheader("Category by State")
280
+ fig, ax = plt.subplots(figsize=(16, 6))
281
+ sns.countplot(data=agoda_df, x='state', hue='category', ax=ax, palette='Set1')
282
+ ax.set_title('Category by State')
283
+ ax.tick_params(axis='x', rotation=90)
284
+ plt.tight_layout()
285
+ st.pyplot(fig)
286
+
287
+ st.write("""
288
+ **Plot-Wise Analysis of Regional Price and Category Trends:**
289
+ - **Hotel Prices Across Indian States:** Prices vary significantly by state, reflecting regional differences in demand and supply. Certain states with popular tourist destinations show higher hotel prices.
290
+ - **Hotel Categories by State:** States with more "Low Budget" and "Budget" hotels cater to cost-conscious travelers. States with more "Luxury" hotels are likely tourist hubs or cater to premium audiences.
291
+ - **Summary:** Regional trends indicate diverse pricing and category distributions, influenced by tourism and economic conditions in different states.
292
+ """)
293
+
294
+
295
+ st.title("Multivariate Analysis of Hotel Data")
296
+
297
+ # Create a subset of the data for the analysis
298
+ subset_data = agoda_df[['category', 'price', 'reviews', 'discount', 'cashback', 'rating']]
299
+
300
+ # Section 1: Price vs. Reviews by Category
301
+ st.header("Price vs. Reviews by Category")
302
+ fig1 = sns.catplot(data=agoda_df, x='reviews', y='price', hue='category', kind='strip', palette='Set2', height=6, aspect=1.5)
303
+ fig1.set_axis_labels("Reviews", "Price")
304
+ fig1.fig.suptitle('Price vs Reviews by Category', fontsize=16)
305
+ st.pyplot(fig1)
306
+
307
+ # Section 2: Price vs. Discount by Category
308
+ st.header("Price vs. Discount by Category")
309
+ fig2 = sns.catplot(data=agoda_df, x='discount', y='price', hue='category', kind='bar', palette='Set2', height=6, aspect=1.5)
310
+ fig2.set_axis_labels("Discount", "Price")
311
+ fig2.fig.suptitle('Price vs Discount by Category', fontsize=16)
312
+ st.pyplot(fig2)
313
+
314
+ # Section 3: Price vs Cashback and Rating by Category (Stripplot)
315
+ st.header("Price vs Cashback and Rating by Category")
316
+ fig3, axes2 = plt.subplots(1, 2, figsize=(16, 6))
317
+
318
+ sns.stripplot(data=agoda_df, x='cashback', y='price', hue='category', ax=axes2[0], palette='Set2', jitter=True, dodge=True)
319
+ axes2[0].set_title('Price vs Cashback by Category')
320
+
321
+ sns.stripplot(data=agoda_df, x='rating', y='price', hue='category', ax=axes2[1], palette='Set2', jitter=True, dodge=True)
322
+ axes2[1].set_title('Price vs Rating by Category')
323
+
324
+ st.pyplot(fig3)
325
+
326
+ # Insights and analysis
327
+ st.header("Plot-Wise Analysis Insights")
328
+
329
+ st.subheader("Price vs. Reviews by Category")
330
+ st.write("""
331
+ - Wide price ranges exist within each category, such as "Low Budget" having both low- and high-priced hotels.
332
+ - Slight tendency for hotels with more reviews to have higher prices, influenced by popularity and marketing efforts.
333
+ """)
334
+
335
+ st.subheader("Price vs. Discount by Category")
336
+ st.write("""
337
+ - Discounts decrease as hotel prices increase, confirming a negative correlation.
338
+ - "Low Budget" and "Budget" hotels offer higher discounts compared to "Premium" and "Luxury" categories.
339
+ """)
340
+
341
+ st.subheader("Price vs Cashback and Rating by Category")
342
+ st.write("""
343
+ - Lower-priced categories ("Budget" and "Low Budget") offer higher cashback incentives.
344
+ - Higher-priced categories ("Premium" and "Luxury") tend to have better ratings.
345
+ - Some lower-priced hotels achieve high ratings, indicating other factors like service quality influence customer satisfaction.
346
+ """)
347
+
348
+ # Section 4: Correlation Heatmap
349
+ st.header("Correlation Matrix Heatmap")
350
+ numeric_columns = ['price', 'reviews', 'discount', 'cashback', 'rating']
351
+
352
+ # Compute the correlation matrix
353
+ correlation_matrix = agoda_df[numeric_columns].corr()
354
+
355
+ # Create a heatmap to visualize the correlation matrix
356
+ plt.figure(figsize=(10, 8))
357
+ sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5, vmin=-1, vmax=1)
358
+
359
+ # Set title for the plot
360
+ plt.title('Correlation Matrix Heatmap')
361
+
362
+ # Display the plot
363
+ st.pyplot(plt)
364
+
365
+ st.subheader("Correlation Matrix Heatmap")
366
+ st.write("""
367
+ - Strong negative correlation observed between price and discounts/cashbacks.
368
+ - Weak positive correlation between price and rating.
369
+ - Moderate positive correlation between reviews and ratings, and a weak positive correlation between reviews and price.
370
+ """)
371
+
372
+ st.header("Overall Summary")
373
+ st.write("""
374
+ - Most properties are affordable, with lower prices, cashback, and discounts dominating the dataset.
375
+ - Regional distribution shows states like Maharashtra and Madhya Pradesh having more hotels.
376
+ - The data reflects a market focused on affordability and basic amenities, with regional and category-specific variations.
377
+ - Cancellations and reviews provide further insights into customer behavior, while skewed distributions highlight potential outliers and trends in pricing and service offerings.
378
+ """)
379
+
380
+ st.header("Why Right-Skewed Trends Are Normal, Not Outliers")
381
+ st.write("""
382
+ - Right-skewed distributions for price, cashback, discounts, cancellations, reviews, and amenities are normal trends in the market.
383
+ - These trends represent the expected distribution of data where higher values are less frequent but are not considered outliers.
384
+ - The variations in cancellation patterns and review counts reflect typical customer behavior and industry dynamics.
385
+ """)
386
+
387
 
388
  else:
389
  st.warning("No dataset found in session state. Please load the dataset into `st.session_state['data']`.")