Mpavan45 committed on
Commit
a288117
·
verified ·
1 Parent(s): 494cbbd

Update pages/3_EDA and Feature Engineering.py

Browse files
Files changed (1) hide show
  1. pages/3_EDA and Feature Engineering.py +471 -472
pages/3_EDA and Feature Engineering.py CHANGED
@@ -99,479 +99,478 @@ if data is not None:
99
  st.success("Dataset successfully loaded from session state!")
100
 
101
  st.subheader("Univariate Analysis")
102
-
103
- # Rating and Review Text Distribution
104
- st.subheader("Rating and Review Text Distribution")
105
- fig, axs = plt.subplots(1, 2, figsize=(16, 6))
106
-
107
- data["rating"].value_counts().plot(kind='pie', title='Distribution of Ratings', autopct='%1.1f%%', shadow=True, startangle=45, textprops={'size': 'x-large'}, ax=axs[0])
108
- axs[0].set_title("Distribution of Ratings")
109
-
110
- data['review_text'].value_counts().plot(kind='pie', title='Distribution of Review Text', autopct='%1.1f%%', shadow=True, startangle=45, textprops={'size': 'x-large'}, ax=axs[1])
111
- axs[1].set_title("Distribution of Review Text")
112
-
113
- st.pyplot(fig)
114
-
115
- # Hotel Star Insights
116
- st.write("""
117
- **Insight:**
118
- - Majority of hotels in this data are 3-star hotels.
119
- - Frequency of 4-star and 5-star hotels are also moderately good.
120
- - 1-star and 2-star hotels are lower in frequency.
121
- """)
122
-
123
- # Price, Cashback, and Discount Distribution
124
- st.subheader("Price, Cashback, and Discount Distribution")
125
- fig, axs = plt.subplots(1, 3, figsize=(16, 6))
126
-
127
- sns.histplot(data=data, x='price', color='green', kde=True, ax=axs[0])
128
- axs[0].set_title("Count based on price")
129
- axs[0].set_xlabel('Price')
130
- axs[0].set_ylabel('Number of People')
131
-
132
- sns.histplot(data=data, x='cashback', color='violet', kde=True, ax=axs[1])
133
- axs[1].set_title("Count based on cashback")
134
- axs[1].set_xlabel('Cashback')
135
- axs[1].set_ylabel('Number of People')
136
-
137
- sns.histplot(data=data, x='discount', color='orange', kde=True, ax=axs[2])
138
- axs[2].set_title("Count based on discount")
139
- axs[2].set_xlabel('Discount')
140
-
141
- st.pyplot(fig)
142
- # Histogram Insights
143
- st.subheader("Plot-wise Analysis of Histograms")
144
- st.write("""
145
- **Price Distribution Insight:**
146
- - The histogram is right-skewed, showing most properties are in the lower price range.
147
- - A long tail indicates the presence of a few very expensive properties.
148
-
149
- **Cashback Distribution Insight:**
150
- - The histogram is right-skewed, with the majority of properties offering lower cashback amounts.
151
- - Only a small number of properties provide higher cashback.
152
-
153
- **Discount Distribution Insight:**
154
- - The histogram is right-skewed, indicating that most properties offer lower discount percentages.
155
- - A few properties stand out with higher discounts.
156
-
157
- **Summary:**
158
- The data suggest that Agoda properties are generally affordable, with lower cashback and discount offers being common. Further statistical analysis could help uncover more detailed insights.
159
- """)
160
-
161
- # Cancellation and State Distribution
162
- st.subheader("Cancellation and State Distribution")
163
- fig, axs = plt.subplots(1, 2, figsize=(16, 6))
164
-
165
- data["cancellation"].value_counts().plot(kind='bar', title='Distribution of Cancellation', color='red', ax=axs[0])
166
- axs[0].set_title("Distribution of Cancellation")
167
- axs[0].set_xlabel('Cancellation')
168
- axs[0].set_ylabel('Number of Hotels')
169
-
170
- data["state"].value_counts().plot(kind='bar', title='Distribution of State', color='black', ax=axs[1])
171
- axs[1].set_title("Distribution of State")
172
- axs[1].set_xlabel('State')
173
- axs[1].set_ylabel('Number of Hotels')
174
-
175
- st.pyplot(fig)
176
- # Bar Chart Insights
177
- st.subheader("Plot Wise Analysis of Bar Charts")
178
- st.write("""
179
- **Cancellations:**
180
- - Most cancellations fall under category "1," indicating they occur within specific conditions or timeframes.
181
-
182
- **State Distribution:**
183
- - "Maharashtra" has the highest number of hotels, followed by "Madhya Pradesh."
184
- - Other states like Gujarat, Karnataka, and Kerala also have notable hotel counts.
185
- - The distribution is uneven, with some states having significantly more hotels.
186
-
187
- **Summary:**
188
- The charts highlight cancellation trends and the regional hotel distribution in India.
189
- """)
190
-
191
- # Category and Reviews Distribution
192
- st.subheader("Category and Reviews Distribution")
193
- fig, axs = plt.subplots(1, 2, figsize=(16, 6))
194
-
195
- colors = sns.color_palette('Set2', n_colors=len(data["category"].value_counts()))
196
- data["category"].value_counts().plot(kind='bar', ax=axs[0], color=colors)
197
- axs[0].set_title("Distribution of Category")
198
- axs[0].set_xlabel('Category')
199
- axs[0].set_ylabel('Number of Hotels')
200
-
201
- sns.histplot(data=data, x='reviews', color='violet', kde=True, ax=axs[1])
202
- axs[1].set_title("Count based on Reviews")
203
- axs[1].set_xlabel('Reviews')
204
- axs[1].set_ylabel('Number of Reviews')
205
-
206
- st.pyplot(fig)
207
- # Hotel Categories and Reviews Insights
208
- st.subheader("Plot Wise Analysis of Hotel Categories and Reviews")
209
- st.write("""
210
- **Category Distribution:**
211
- - The histogram shows "Low Budget" hotels are the most common, followed by "Budget Hotels," while "Luxury Hotels" are the least common.
212
-
213
- **Review Count Distribution:**
214
- - The histogram is right-skewed, with most hotels having a low number of reviews.
215
- - A few hotels have a very high number of reviews, evident from the long tail.
216
-
217
- **Summary:**
218
- The data indicates a higher concentration of low-budget hotels and relatively low review counts for most hotels.
219
- """)
220
-
221
- # Top 10 Amenities
222
- st.subheader("Top 10 Amenities")
223
- amenity_counts = data['free_services'].str.split(',').explode().str.strip().value_counts().reset_index()
224
- amenity_counts.columns = ['Amenity', 'Count']
225
-
226
- fig, ax = plt.subplots(figsize=(10, 6))
227
- sns.barplot(x='Count', y='Amenity', data=amenity_counts.head(10), palette='viridis', ax=ax)
228
- ax.set_title('Top 10 Amenities')
229
- ax.set_xlabel('Number of Hotels Offering')
230
- ax.set_ylabel('Amenity')
231
-
232
- st.pyplot(fig)
233
- # Top Amenities Insights
234
- st.subheader("Plot Wise Analysis of Top Amenities")
235
- st.write("""
236
- **Common Amenities:**
237
- - Complimentary Parking is the most frequently offered amenity.
238
- - Basic Toiletries and Hair Dryers are also widely available.
239
-
240
- **Less Common Amenities:**
241
- - Fitness Center Access, Welcome Drinks, and Turndown Service are less common.
242
- - Shoe Shine Service is the least frequently offered amenity.
243
-
244
- **Summary:**
245
- Hotels tend to prioritize basic amenities like parking, toiletries, and hair dryers, while luxurious amenities are offered less frequently.
246
- """)
247
-
248
- st.title("Bivariate Analysis")
249
-
250
- # Price vs Rating scatter plot
251
- st.subheader("Price vs Rating")
252
- fig, ax = plt.subplots(figsize=(7, 5))
253
- sns.scatterplot(x='rating', y='price', data=data, color='orange')
254
- ax.set_title('Price vs Rating')
255
- ax.set_xlabel('Rating')
256
- ax.set_ylabel('Price')
257
- st.pyplot(fig)
258
-
259
- # Price vs Rating
260
- st.subheader("Price vs Rating:")
261
- st.write("""
262
- - **Analysis:**
263
- - Higher-priced hotels slightly tend to have better ratings, but ratings vary widely across price points.
264
- - Hotels at various price points exhibit a large spread in ratings, meaning factors other than price (such as customer experience or amenities) contribute significantly to the rating.
265
- """)
266
-
267
- # Price vs Discount scatter plot
268
- st.subheader("Price vs Discount")
269
- fig, ax = plt.subplots(figsize=(7, 5))
270
- sns.scatterplot(x='discount', y='price', data=data, color='green')
271
- ax.set_title('Price vs Discount')
272
- ax.set_xlabel('Discount')
273
- ax.set_ylabel('Price')
274
- st.pyplot(fig)
275
- # Price vs Discount
276
- st.subheader("Price vs Discount:")
277
- st.write("""
278
- - **Analysis:**
279
- - Some high-priced hotels still provide discounts due to promotions or special deals.
280
- - This observation suggests that while premium hotels may not always need to offer discounts to attract customers, they occasionally use them as a marketing strategy or for seasonal promotions.
281
- """)
282
-
283
- # Price vs Cashback scatter plot
284
- st.subheader("Price vs Cashback")
285
- fig, ax = plt.subplots(figsize=(7, 5))
286
- sns.scatterplot(x='cashback', y='price', data=data, color='blue')
287
- ax.set_title('Price vs Cashback')
288
- ax.set_xlabel('Cashback')
289
- ax.set_ylabel('Price')
290
- st.pyplot(fig)
291
-
292
- # Price vs Cashback
293
- st.subheader("Price vs Cashback:")
294
- st.write("""
295
- - **Analysis:**
296
- - Exceptions exist due to promotional campaigns.
297
- - While higher-priced hotels generally offer fewer cashback incentives, some may offer cashback due to specific promotional campaigns aimed at increasing sales volume or attracting customers in a competitive market.
298
- """)
299
-
300
- # Price vs Category bar plot
301
- st.subheader("Price vs Category")
302
- fig, ax = plt.subplots(figsize=(7, 5))
303
- sns.barplot(x='category', y='price', data=data, palette='Set2')
304
- ax.set_title('Price vs Category')
305
- ax.set_xlabel('Category')
306
- ax.set_ylabel('Price')
307
- st.pyplot(fig)
308
- # Price vs Category
309
- st.subheader("Price vs Category:")
310
- st.write("""
311
- - **Analysis:**
312
- - "Luxury" hotels have the highest prices, followed by "Premium" and "Free & Easy."
313
- - "Low Budget" and "Budget" hotels occupy the lower price range, showing that the category directly influences pricing strategy.
314
- - Categories like "Luxury" and "Premium" aim to target a specific market willing to pay more for superior quality and services, while "Budget" and "Low Budget" cater to a price-sensitive segment.
315
- """)
316
- # Summary of Scatter Plot Analysis
317
- st.subheader("Summary of Scatter Plot Analysis:")
318
- st.write("""
319
- - **Price vs Rating:** Higher-priced hotels generally offer better ratings, but the ratings vary widely across price points, indicating that factors such as service quality and amenities matter significantly.
320
- - **Price vs Discount:** High-priced hotels may still offer discounts due to seasonal promotions or special offers.
321
- - **Price vs Cashback:** Although high-priced hotels generally offer fewer cashback incentives, there are exceptions driven by promotional campaigns.
322
- - **Price vs Category:** "Luxury" hotels are the most expensive, followed by "Premium" and "Free & Easy" categories. On the other hand, "Low Budget" and "Budget" hotels have lower prices.
323
- - **Overall Insight:** The scatter plots reveal trends where higher-priced hotels tend to offer better ratings but fewer discounts and cashback incentives, while lower-priced categories tend to provide more promotional benefits such as discounts and cashback.
324
- """)
325
-
326
- # Rating vs Category bar plot
327
- st.subheader("Rating vs Category")
328
- fig, ax = plt.subplots(figsize=(7, 5))
329
- sns.barplot(x='category', y='rating', data=data, palette='Set1')
330
- ax.set_title('Rating vs Category')
331
- ax.set_xlabel('Category')
332
- ax.set_ylabel('Rating')
333
- st.pyplot(fig)
334
- # Rating vs Category
335
- st.subheader("Rating vs Category:")
336
- st.write("""
337
- - **Analysis:**
338
- - "Luxury" hotels lead in average ratings, followed by "Premium" hotels.
339
- - "Budget" and "Low Budget" categories show lower average ratings, indicating that these hotels may focus on price rather than offering premium services or experiences.
340
- - The disparity in ratings shows that customers tend to have higher expectations for luxury and premium accommodations, which are reflected in the ratings.
341
- """)
342
-
343
- # Discount vs Category box plot
344
- st.subheader("Discount vs Category")
345
- fig, ax = plt.subplots(figsize=(7, 5))
346
- sns.boxplot(x='category', y='discount', data=data, palette='Set2')
347
- ax.set_title('Discount vs Category')
348
- ax.set_xlabel('Category')
349
- ax.set_ylabel('Discount')
350
- st.pyplot(fig)
351
- # Discount vs Category
352
- st.subheader("Discount vs Category:")
353
- st.write("""
354
- - **Analysis:**
355
- - "Low Budget" hotels offer the highest discounts, while "Luxury" hotels provide the least discounts.
356
- - This suggests that budget-friendly hotels use discounts as a key strategy to attract price-sensitive customers, whereas luxury hotels focus on providing premium experiences without relying on price reductions.
357
- - The discount strategy varies by category, with lower-priced categories incentivizing customers with discounts to stay competitive.
358
- """)
359
- # Cashback vs Category violin plot
360
- st.subheader("Cashback vs Category")
361
- fig, ax = plt.subplots(figsize=(7, 5))
362
- sns.violinplot(x='category', y='cashback', data=data, palette='Set3')
363
- ax.set_title('Cashback vs Category')
364
- ax.set_xlabel('Category')
365
- ax.set_ylabel('Cashback')
366
- st.pyplot(fig)
367
- # Cashback vs Category
368
- st.subheader("Cashback vs Category:")
369
- st.write("""
370
- - **Analysis:**
371
- - Higher cashback offers are more common in "Low Budget" hotels, as these hotels rely on cashback incentives to attract customers looking for value deals.
372
- - Luxury hotels rarely provide cashback, as their target market is less likely to be motivated by such offers.
373
- - The trend highlights the different strategies employed by each category: budget options often provide financial incentives like cashback to drive bookings, while luxury options focus on premium services.
374
- """)
375
-
376
- # Reviews vs Category count plot
377
- st.subheader("Reviews vs Category")
378
- fig, ax = plt.subplots(figsize=(7, 5))
379
- sns.countplot(x='category', data=data, palette='Set1')
380
- ax.set_title('Reviews vs Category (Count)')
381
- ax.set_xlabel('Category')
382
- ax.set_ylabel('Count of Reviews')
383
- st.pyplot(fig)
384
- # Reviews vs Category
385
- st.subheader("Reviews vs Category:")
386
- st.write("""
387
- - **Analysis:**
388
- - "Luxury" hotels attract the most reviews, indicating that higher-quality accommodations often receive more feedback from customers.
389
- - "Budget" and "Low Budget" hotels tend to have fewer reviews, which may be due to their more straightforward offerings and smaller customer base.
390
- - This trend suggests that customers who opt for luxury hotels are more likely to share their experiences, whereas budget options may attract fewer repeat customers or have less word-of-mouth influence.
391
- """)
392
- # Summary of Bar and Box Plot Analysis
393
- st.subheader("Summary of Bar and Box Plot Analysis:")
394
- st.write("""
395
- - **Rating vs Category:** "Luxury" and "Premium" hotels have higher ratings on average, while "Budget" and "Low Budget" hotels show lower ratings.
396
- - **Discount vs Category:** Budget hotels, especially "Low Budget," offer higher discounts, while luxury hotels offer fewer discounts, relying more on their value proposition.
397
- - **Cashback vs Category:** "Low Budget" hotels offer higher cashback incentives, while luxury hotels rarely provide cashback, highlighting the pricing strategies in different categories.
398
- - **Reviews vs Category:** "Luxury" hotels attract the most reviews, while "Budget" and "Low Budget" hotels attract fewer.
399
- - **Overall Insight:** Bar and box plots reveal that higher-rated and more reviewed hotels tend to offer fewer discounts or cashback, focusing on customer experience, while budget categories focus on providing customer incentives to compete in the market.
400
- """)
401
- # Regional price analysis by state
402
- st.subheader("Price by State")
403
- fig, ax = plt.subplots(figsize=(16, 6))
404
- sns.barplot(data=data, x='state', y='price', ax=ax, color='green')
405
- ax.set_title('Price by State')
406
- ax.tick_params(axis='x', rotation=90)
407
- sns.set_palette('magma')
408
- plt.tight_layout()
409
- st.pyplot(fig)
410
- # Hotel Prices Across Indian States
411
- st.subheader("Hotel Prices Across Indian States:")
412
- st.write("""
413
- - **Analysis:**
414
- - Hotel prices vary significantly across Indian states, reflecting regional differences in demand and supply.
415
- - States with popular tourist destinations, such as Goa and Rajasthan, tend to show higher hotel prices, as they attract more visitors and have a higher demand for accommodations.
416
- - Conversely, states with less tourism or lower demand may exhibit more affordable pricing, catering to the local population or budget travelers.
417
- - Price differences also reflect factors such as local economic conditions, infrastructure, and tourism policies in each state.
418
- """)
419
-
420
- # Regional category count by state
421
- st.subheader("Category by State")
422
- fig, ax = plt.subplots(figsize=(16, 6))
423
- sns.countplot(data=data, x='state', hue='category', ax=ax, palette='Set1')
424
- ax.set_title('Category by State')
425
- ax.tick_params(axis='x', rotation=90)
426
- plt.tight_layout()
427
- st.pyplot(fig)
428
- # Hotel Categories by State
429
- st.subheader("Hotel Categories by State:")
430
- st.write("""
431
- - **Analysis:**
432
- - States with a higher concentration of "Low Budget" and "Budget" hotels cater primarily to cost-conscious travelers, offering affordable accommodations for a wide range of customers.
433
- - States with more "Luxury" hotels are likely to be major tourist hubs, such as Delhi, Mumbai, and Kerala, or regions that cater to premium audiences, offering high-end services for affluent customers.
434
- - These states may also focus on attracting international tourists or business travelers who prefer premium amenities and luxury experiences.
435
- """)
436
-
437
- # Summary of Regional Price and Category Trends
438
- st.subheader("Summary of Regional Price and Category Trends:")
439
- st.write("""
440
- - **Hotel Prices Across Indian States:** Prices vary significantly depending on the state, with tourist-heavy regions showing higher prices due to greater demand.
441
- - **Hotel Categories by State:** States with more budget hotels focus on catering to price-sensitive travelers, while states with luxury hotels cater to premium audiences, often in tourist hotspots.
442
- - **Overall Insight:** Regional trends indicate diverse pricing and category distributions, influenced by tourism, regional economics, and state-specific factors that shape hotel offerings across the country.
443
- """)
444
-
445
- st.title("Multivariate Analysis of Hotel Data")
446
-
447
- # Create a subset of the data for the analysis
448
- subset_data = data[['category', 'price', 'reviews', 'discount', 'cashback', 'rating']]
449
-
450
- # Section 1: Price vs. Reviews by Category
451
- st.header("Price vs. Reviews by Category")
452
- fig1 = sns.catplot(data=data, x='reviews', y='price', hue='category', kind='strip', palette='Set2', height=6, aspect=1.5)
453
- fig1.set_axis_labels("Reviews", "Price")
454
- fig1.fig.suptitle('Price vs Reviews by Category', fontsize=16)
455
- st.pyplot(fig1)
456
- # Analysis Text for Price vs. Reviews by Category
457
- st.write("""
458
- - **Price Variation within Categories:**
459
- - Wide price ranges exist within each category, with "Low Budget" hotels featuring both low- and high-priced options.
460
- - This shows that pricing within categories isn't always uniform and may depend on other factors like location, amenities, and hotel size.
461
-
462
- - **Price and Reviews Relationship:**
463
- - There's a slight tendency for hotels with more reviews to have higher prices, possibly due to the influence of popularity, better marketing efforts, or higher quality services.
464
-
465
- - **Summary:**
466
- - The stripplot reveals a weak positive correlation between price and reviews, indicating that well-reviewed hotels tend to have higher prices.
467
- """)
468
-
469
- # Section 2: Price vs. Discount by Category
470
- st.header("Price vs. Discount by Category")
471
- fig2 = sns.catplot(data=data, x='discount', y='price', hue='category', kind='bar', palette='Set2', height=6, aspect=1.5)
472
- fig2.set_axis_labels("Discount", "Price")
473
- fig2.fig.suptitle('Price vs Discount by Category', fontsize=16)
474
- st.pyplot(fig2)
475
- # Analysis Text for Price vs. Discount by Category
476
- st.write("""
477
- - **Price and Discount Relationship:**
478
- - The stripplot clearly shows that as hotel prices increase, discounts tend to decrease. This confirms a negative correlation between price and discount.
479
-
480
- - **Category-Specific Trends:**
481
- - "Low Budget" and "Budget" hotels offer much higher discounts compared to "Premium" and "Luxury" hotels, which typically offer fewer or smaller discounts.
482
- - This trend highlights that budget-conscious categories use higher discounts to attract customers, whereas premium and luxury categories rely on factors other than discounts (e.g., quality, exclusivity) to appeal to their clientele.
483
-
484
- - **Summary:**
485
- - The plot confirms that lower-priced hotels use higher discounts to attract customers, while premium and luxury hotels maintain lower discount rates, aligning with typical market behavior.
486
- """)
487
 
488
-
489
- # Section 3: Price vs Cashback and Rating by Category (Stripplot)
490
- st.header("Price vs Cashback and Rating by Category")
491
- fig3, axes2 = plt.subplots(1, 2, figsize=(16, 6))
492
-
493
- sns.stripplot(data=data, x='cashback', y='price', hue='category', ax=axes2[0], palette='Set2', jitter=True, dodge=True)
494
- axes2[0].set_title('Price vs Cashback by Category')
495
-
496
- sns.stripplot(data=data, x='rating', y='price', hue='category', ax=axes2[1], palette='Set2', jitter=True, dodge=True)
497
- axes2[1].set_title('Price vs Rating by Category')
498
-
499
- st.pyplot(fig3)
500
-
501
-
502
- # Analysis Text for Price vs Cashback by Category
503
- st.write("""
504
- - **Price vs Cashback by Category:**
505
- - The plot shows that cashback incentives tend to decrease as hotel prices increase. However, variations exist due to promotional offers that may affect cashback amounts.
506
- - Lower-priced categories, such as "Budget" and "Low Budget," offer higher cashback incentives to attract price-sensitive customers.
507
-
508
- - **Summary:**
509
- - The stripplot reveals a clear trend where lower-priced hotels use cashback as an incentive to boost bookings, whereas higher-priced hotels focus on other value propositions (e.g., premium services).
510
- """)
511
- # Analysis Text for Price vs Rating by Category
512
- st.write("""
513
- - **Price vs Rating by Category:**
514
- - Higher-priced categories like "Premium" and "Luxury" tend to have better ratings, indicating that customers perceive these hotels as providing superior value.
515
- - Interestingly, some lower-priced hotels achieve high ratings, suggesting that other factors such as service quality and customer experience may contribute to higher satisfaction despite the lower price point.
516
-
517
- - **Summary:**
518
- - The stripplot shows that while higher-priced hotels emphasize quality to achieve better ratings, lower-priced hotels still manage to deliver satisfactory experiences for customers through factors like service quality.
519
- """)
520
- # Section 4: Correlation Heatmap
521
- st.header("Correlation Matrix Heatmap")
522
- numeric_columns = ['price', 'reviews', 'discount', 'cashback', 'rating']
523
-
524
- # Compute the correlation matrix
525
- correlation_matrix = data[numeric_columns].corr()
526
-
527
- # Create a heatmap to visualize the correlation matrix
528
- plt.figure(figsize=(10, 8))
529
- sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5, vmin=-1, vmax=1)
530
-
531
- # Set title for the plot
532
- plt.title('Correlation Matrix Heatmap')
533
-
534
- # Display the plot
535
- st.pyplot(plt)
536
-
537
- # Analysis Text for Correlation Heatmap
538
- st.write("""
539
- - **Price vs Discount/Cashback:**
540
- - The heatmap shows a strong negative correlation between price and both discount and cashback. Higher-priced hotels tend to offer fewer discounts and cashback incentives, suggesting that premium offerings rely on value rather than incentives.
541
-
542
- - **Price vs Rating:**
543
- - A weak positive correlation is observed between price and rating. Higher-priced hotels generally have slightly better ratings, although this relationship is not very strong.
544
-
545
- - **Reviews vs Rating:**
546
- - A moderate positive correlation exists between reviews and ratings. Hotels that attract more reviews tend to have better ratings, likely due to a larger customer base providing feedback.
547
-
548
- - **Reviews vs Price:**
549
- - A weak positive correlation between reviews and price suggests that well-reviewed hotels are often priced higher, likely due to their popularity and perceived value.
550
-
551
- - **Summary:**
552
- - The heatmap provides insights into the relationships between key variables in the dataset. Higher-priced hotels tend to offer fewer discounts and cashback offers but generally have better ratings and more reviews, indicating that customers are willing to pay a premium for a better experience.
553
- """)
554
-
555
- st.header("Overall Summary")
556
- st.write("""
557
- - Most properties are affordable, with lower prices, cashback, and discounts dominating the dataset.
558
- - Regional distribution shows states like Maharashtra and Madhya Pradesh having more hotels.
559
- - The data reflects a market focused on affordability and basic amenities, with regional and category-specific variations.
560
- - Cancellations and reviews provide further insights into customer behavior, while skewed distributions highlight potential outliers and trends in pricing and service offerings.
561
- """)
562
-
563
- st.header("Why Right-Skewed Trends Are Normal, Not Outliers")
564
- st.write("""
565
- - Right-skewed distributions for price, cashback, discounts, cancellations, reviews, and amenities are normal trends in the market.
566
- - These trends represent the expected distribution of data where higher values are less frequent but are not considered outliers.
567
- - The variations in cancellation patterns and review counts reflect typical customer behavior and industry dynamics.
568
- """)
569
- st.write("""
570
- Since no outliers were detected, we can proceed with model training and selection.
571
- With clean data, we can now focus on choosing the best algorithm, tuning hyperparameters, and evaluating model performance.
572
- """)
573
 
 
 
574
 
575
- else:
576
- st.warning("No dataset found in session state. Please load the dataset into `st.session_state['data']`.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
577
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
  st.success("Dataset successfully loaded from session state!")
100
 
101
  st.subheader("Univariate Analysis")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
 
103
+ # Rating and Review Text Distribution
104
+ st.subheader("Rating and Review Text Distribution")
105
+ fig, axs = plt.subplots(1, 2, figsize=(16, 6))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
 
107
+ data["rating"].value_counts().plot(kind='pie', title='Distribution of Ratings', autopct='%1.1f%%', shadow=True, startangle=45, textprops={'size': 'x-large'}, ax=axs[0])
108
+ axs[0].set_title("Distribution of Ratings")
109
 
110
+ data['review_text'].value_counts().plot(kind='pie', title='Distribution of Review Text', autopct='%1.1f%%', shadow=True, startangle=45, textprops={'size': 'x-large'}, ax=axs[1])
111
+ axs[1].set_title("Distribution of Review Text")
112
+
113
+ st.pyplot(fig)
114
+
115
+ # Hotel Star Insights
116
+ st.write("""
117
+ **Insight:**
118
+ - Majority of hotels in this data are 3-star hotels.
119
+ - Frequency of 4-star and 5-star hotels are also moderately good.
120
+ - 1-star and 2-star hotels are lower in frequency.
121
+ """)
122
+
123
+ # Price, Cashback, and Discount Distribution
124
+ st.subheader("Price, Cashback, and Discount Distribution")
125
+ fig, axs = plt.subplots(1, 3, figsize=(16, 6))
126
+
127
+ sns.histplot(data=data, x='price', color='green', kde=True, ax=axs[0])
128
+ axs[0].set_title("Count based on price")
129
+ axs[0].set_xlabel('Price')
130
+ axs[0].set_ylabel('Number of People')
131
+
132
+ sns.histplot(data=data, x='cashback', color='violet', kde=True, ax=axs[1])
133
+ axs[1].set_title("Count based on cashback")
134
+ axs[1].set_xlabel('Cashback')
135
+ axs[1].set_ylabel('Number of People')
136
+
137
+ sns.histplot(data=data, x='discount', color='orange', kde=True, ax=axs[2])
138
+ axs[2].set_title("Count based on discount")
139
+ axs[2].set_xlabel('Discount')
140
+
141
+ st.pyplot(fig)
142
+ # Histogram Insights
143
+ st.subheader("Plot-wise Analysis of Histograms")
144
+ st.write("""
145
+ **Price Distribution Insight:**
146
+ - The histogram is right-skewed, showing most properties are in the lower price range.
147
+ - A long tail indicates the presence of a few very expensive properties.
148
+
149
+ **Cashback Distribution Insight:**
150
+ - The histogram is right-skewed, with the majority of properties offering lower cashback amounts.
151
+ - Only a small number of properties provide higher cashback.
152
+
153
+ **Discount Distribution Insight:**
154
+ - The histogram is right-skewed, indicating that most properties offer lower discount percentages.
155
+ - A few properties stand out with higher discounts.
156
+
157
+ **Summary:**
158
+ The data suggest that Agoda properties are generally affordable, with lower cashback and discount offers being common. Further statistical analysis could help uncover more detailed insights.
159
+ """)
160
+
161
+ # Cancellation and State Distribution
162
+ st.subheader("Cancellation and State Distribution")
163
+ fig, axs = plt.subplots(1, 2, figsize=(16, 6))
164
+
165
+ data["cancellation"].value_counts().plot(kind='bar', title='Distribution of Cancellation', color='red', ax=axs[0])
166
+ axs[0].set_title("Distribution of Cancellation")
167
+ axs[0].set_xlabel('Cancellation')
168
+ axs[0].set_ylabel('Number of Hotels')
169
+
170
+ data["state"].value_counts().plot(kind='bar', title='Distribution of State', color='black', ax=axs[1])
171
+ axs[1].set_title("Distribution of State")
172
+ axs[1].set_xlabel('State')
173
+ axs[1].set_ylabel('Number of Hotels')
174
+
175
+ st.pyplot(fig)
176
+ # Bar Chart Insights
177
+ st.subheader("Plot Wise Analysis of Bar Charts")
178
+ st.write("""
179
+ **Cancellations:**
180
+ - Most cancellations fall under category "1," indicating they occur within specific conditions or timeframes.
181
+
182
+ **State Distribution:**
183
+ - "Maharashtra" has the highest number of hotels, followed by "Madhya Pradesh."
184
+ - Other states like Gujarat, Karnataka, and Kerala also have notable hotel counts.
185
+ - The distribution is uneven, with some states having significantly more hotels.
186
+
187
+ **Summary:**
188
+ The charts highlight cancellation trends and the regional hotel distribution in India.
189
+ """)
190
+
191
+ # Category and Reviews Distribution
192
+ st.subheader("Category and Reviews Distribution")
193
+ fig, axs = plt.subplots(1, 2, figsize=(16, 6))
194
+
195
+ colors = sns.color_palette('Set2', n_colors=len(data["category"].value_counts()))
196
+ data["category"].value_counts().plot(kind='bar', ax=axs[0], color=colors)
197
+ axs[0].set_title("Distribution of Category")
198
+ axs[0].set_xlabel('Category')
199
+ axs[0].set_ylabel('Number of Hotels')
200
+
201
+ sns.histplot(data=data, x='reviews', color='violet', kde=True, ax=axs[1])
202
+ axs[1].set_title("Count based on Reviews")
203
+ axs[1].set_xlabel('Reviews')
204
+ axs[1].set_ylabel('Number of Reviews')
205
+
206
+ st.pyplot(fig)
207
+ # Hotel Categories and Reviews Insights
208
+ st.subheader("Plot Wise Analysis of Hotel Categories and Reviews")
209
+ st.write("""
210
+ **Category Distribution:**
211
+ - The bar chart shows "Low Budget" hotels are the most common, followed by "Budget Hotels," while "Luxury Hotels" are the least common.
212
+
213
+ **Review Count Distribution:**
214
+ - The histogram is right-skewed, with most hotels having a low number of reviews.
215
+ - A few hotels have a very high number of reviews, evident from the long tail.
216
+
217
+ **Summary:**
218
+ The data indicates a higher concentration of low-budget hotels and relatively low review counts for most hotels.
219
+ """)
220
+
221
+ # Top 10 Amenities
222
+ st.subheader("Top 10 Amenities")
223
+ amenity_counts = data['free_services'].str.split(',').explode().str.strip().value_counts().reset_index()
224
+ amenity_counts.columns = ['Amenity', 'Count']
225
+
226
+ fig, ax = plt.subplots(figsize=(10, 6))
227
+ sns.barplot(x='Count', y='Amenity', data=amenity_counts.head(10), palette='viridis', ax=ax)
228
+ ax.set_title('Top 10 Amenities')
229
+ ax.set_xlabel('Number of Hotels Offering')
230
+ ax.set_ylabel('Amenity')
231
+
232
+ st.pyplot(fig)
233
+ # Top Amenities Insights
234
+ st.subheader("Plot Wise Analysis of Top Amenities")
235
+ st.write("""
236
+ **Common Amenities:**
237
+ - Complimentary Parking is the most frequently offered amenity.
238
+ - Basic Toiletries and Hair Dryers are also widely available.
239
+
240
+ **Less Common Amenities:**
241
+ - Fitness Center Access, Welcome Drinks, and Turndown Service are less common.
242
+ - Shoe Shine Service is the least frequently offered amenity.
243
+
244
+ **Summary:**
245
+ Hotels tend to prioritize basic amenities like parking, toiletries, and hair dryers, while luxurious amenities are offered less frequently.
246
+ """)
247
+
248
+ st.title("Bivariate Analysis")
249
+
250
+ # Price vs Rating scatter plot
251
+ st.subheader("Price vs Rating")
252
+ fig, ax = plt.subplots(figsize=(7, 5))
253
+ sns.scatterplot(x='rating', y='price', data=data, color='orange')
254
+ ax.set_title('Price vs Rating')
255
+ ax.set_xlabel('Rating')
256
+ ax.set_ylabel('Price')
257
+ st.pyplot(fig)
258
+
259
+ # Price vs Rating
260
+ st.subheader("Price vs Rating:")
261
+ st.write("""
262
+ - **Analysis:**
263
+ - Higher-priced hotels tend to have slightly better ratings, but ratings vary widely across price points.
264
+ - Hotels at various price points exhibit a large spread in ratings, meaning factors other than price (such as customer experience or amenities) contribute significantly to the rating.
265
+ """)
266
+
267
+ # Price vs Discount scatter plot
268
+ st.subheader("Price vs Discount")
269
+ fig, ax = plt.subplots(figsize=(7, 5))
270
+ sns.scatterplot(x='discount', y='price', data=data, color='green')
271
+ ax.set_title('Price vs Discount')
272
+ ax.set_xlabel('Discount')
273
+ ax.set_ylabel('Price')
274
+ st.pyplot(fig)
275
+ # Price vs Discount
276
+ st.subheader("Price vs Discount:")
277
+ st.write("""
278
+ - **Analysis:**
279
+ - Some high-priced hotels still provide discounts due to promotions or special deals.
280
+ - This observation suggests that while premium hotels may not always need to offer discounts to attract customers, they occasionally use them as a marketing strategy or for seasonal promotions.
281
+ """)
282
+
283
+ # Price vs Cashback scatter plot
284
+ st.subheader("Price vs Cashback")
285
+ fig, ax = plt.subplots(figsize=(7, 5))
286
+ sns.scatterplot(x='cashback', y='price', data=data, color='blue')
287
+ ax.set_title('Price vs Cashback')
288
+ ax.set_xlabel('Cashback')
289
+ ax.set_ylabel('Price')
290
+ st.pyplot(fig)
291
+
292
+ # Price vs Cashback
293
+ st.subheader("Price vs Cashback:")
294
+ st.write("""
295
+ - **Analysis:**
296
+ - Exceptions exist due to promotional campaigns.
297
+ - While higher-priced hotels generally offer fewer cashback incentives, some may offer cashback due to specific promotional campaigns aimed at increasing sales volume or attracting customers in a competitive market.
298
+ """)
299
+
300
+ # Price vs Category bar plot
301
+ st.subheader("Price vs Category")
302
+ fig, ax = plt.subplots(figsize=(7, 5))
303
+ sns.barplot(x='category', y='price', data=data, palette='Set2')
304
+ ax.set_title('Price vs Category')
305
+ ax.set_xlabel('Category')
306
+ ax.set_ylabel('Price')
307
+ st.pyplot(fig)
308
+ # Price vs Category
309
+ st.subheader("Price vs Category:")
310
+ st.write("""
311
+ - **Analysis:**
312
+ - "Luxury" hotels have the highest prices, followed by "Premium" and "Free & Easy."
313
+ - "Low Budget" and "Budget" hotels occupy the lower price range, showing that the category directly influences pricing strategy.
314
+ - Categories like "Luxury" and "Premium" aim to target a specific market willing to pay more for superior quality and services, while "Budget" and "Low Budget" cater to a price-sensitive segment.
315
+ """)
316
+ # Summary of Scatter Plot Analysis
317
+ st.subheader("Summary of Scatter Plot Analysis:")
318
+ st.write("""
319
+ - **Price vs Rating:** Higher-priced hotels generally offer better ratings, but the ratings vary widely across price points, indicating that factors such as service quality and amenities matter significantly.
320
+ - **Price vs Discount:** High-priced hotels may still offer discounts due to seasonal promotions or special offers.
321
+ - **Price vs Cashback:** Although high-priced hotels generally offer fewer cashback incentives, there are exceptions driven by promotional campaigns.
322
+ - **Price vs Category:** "Luxury" hotels are the most expensive, followed by "Premium" and "Free & Easy" categories. On the other hand, "Low Budget" and "Budget" hotels have lower prices.
323
+ - **Overall Insight:** The scatter plots reveal trends where higher-priced hotels tend to offer better ratings but fewer discounts and cashback incentives, while lower-priced categories tend to provide more promotional benefits such as discounts and cashback.
324
+ """)
325
+
326
+ # Rating vs Category bar plot
327
+ st.subheader("Rating vs Category")
328
+ fig, ax = plt.subplots(figsize=(7, 5))
329
+ sns.barplot(x='category', y='rating', data=data, palette='Set1')
330
+ ax.set_title('Rating vs Category')
331
+ ax.set_xlabel('Category')
332
+ ax.set_ylabel('Rating')
333
+ st.pyplot(fig)
334
+ # Rating vs Category
335
+ st.subheader("Rating vs Category:")
336
+ st.write("""
337
+ - **Analysis:**
338
+ - "Luxury" hotels lead in average ratings, followed by "Premium" hotels.
339
+ - "Budget" and "Low Budget" categories show lower average ratings, indicating that these hotels may focus on price rather than offering premium services or experiences.
340
+ - The disparity in ratings shows that customers tend to have higher expectations for luxury and premium accommodations, which are reflected in the ratings.
341
+ """)
342
+
343
+ # Discount vs Category box plot
344
+ st.subheader("Discount vs Category")
345
+ fig, ax = plt.subplots(figsize=(7, 5))
346
+ sns.boxplot(x='category', y='discount', data=data, palette='Set2')
347
+ ax.set_title('Discount vs Category')
348
+ ax.set_xlabel('Category')
349
+ ax.set_ylabel('Discount')
350
+ st.pyplot(fig)
351
+ # Discount vs Category
352
+ st.subheader("Discount vs Category:")
353
+ st.write("""
354
+ - **Analysis:**
355
+ - "Low Budget" hotels offer the highest discounts, while "Luxury" hotels provide the least discounts.
356
+ - This suggests that budget-friendly hotels use discounts as a key strategy to attract price-sensitive customers, whereas luxury hotels focus on providing premium experiences without relying on price reductions.
357
+ - The discount strategy varies by category, with lower-priced categories incentivizing customers with discounts to stay competitive.
358
+ """)
359
+ # Cashback vs Category violin plot
360
+ st.subheader("Cashback vs Category")
361
+ fig, ax = plt.subplots(figsize=(7, 5))
362
+ sns.violinplot(x='category', y='cashback', data=data, palette='Set3')
363
+ ax.set_title('Cashback vs Category')
364
+ ax.set_xlabel('Category')
365
+ ax.set_ylabel('Cashback')
366
+ st.pyplot(fig)
367
+ # Cashback vs Category
368
+ st.subheader("Cashback vs Category:")
369
+ st.write("""
370
+ - **Analysis:**
371
+ - Higher cashback offers are more common in "Low Budget" hotels, as these hotels rely on cashback incentives to attract customers looking for value deals.
372
+ - Luxury hotels rarely provide cashback, as their target market is less likely to be motivated by such offers.
373
+ - The trend highlights the different strategies employed by each category: budget options often provide financial incentives like cashback to drive bookings, while luxury options focus on premium services.
374
+ """)
375
+
376
+ # Reviews vs Category count plot
377
+ st.subheader("Reviews vs Category")
378
+ fig, ax = plt.subplots(figsize=(7, 5))
379
+ sns.countplot(x='category', data=data, palette='Set1')
380
+ ax.set_title('Reviews vs Category (Count)')
381
+ ax.set_xlabel('Category')
382
+ ax.set_ylabel('Count of Reviews')
383
+ st.pyplot(fig)
384
+ # Reviews vs Category
385
+ st.subheader("Reviews vs Category:")
386
+ st.write("""
387
+ - **Analysis:**
388
+ - "Luxury" hotels attract the most reviews, indicating that higher-quality accommodations often receive more feedback from customers.
389
+ - "Budget" and "Low Budget" hotels tend to have fewer reviews, which may be due to their more straightforward offerings and smaller customer base.
390
+ - This trend suggests that customers who opt for luxury hotels are more likely to share their experiences, whereas budget options may attract fewer repeat customers or have less word-of-mouth influence.
391
+ """)
392
+ # Summary of Bar and Box Plot Analysis
393
+ st.subheader("Summary of Bar and Box Plot Analysis:")
394
+ st.write("""
395
+ - **Rating vs Category:** "Luxury" and "Premium" hotels have higher ratings on average, while "Budget" and "Low Budget" hotels show lower ratings.
396
+ - **Discount vs Category:** Budget hotels, especially "Low Budget," offer higher discounts, while luxury hotels offer fewer discounts, relying more on their value proposition.
397
+ - **Cashback vs Category:** "Low Budget" hotels offer higher cashback incentives, while luxury hotels rarely provide cashback, highlighting the pricing strategies in different categories.
398
+ - **Reviews vs Category:** "Luxury" hotels attract the most reviews, while "Budget" and "Low Budget" hotels attract fewer.
399
+ - **Overall Insight:** Bar and box plots reveal that higher-rated and more reviewed hotels tend to offer fewer discounts or cashback, focusing on customer experience, while budget categories focus on providing customer incentives to compete in the market.
400
+ """)
401
+ # Regional price analysis by state
402
+ st.subheader("Price by State")
403
+ fig, ax = plt.subplots(figsize=(16, 6))
404
+ sns.barplot(data=data, x='state', y='price', ax=ax, color='green')
405
+ ax.set_title('Price by State')
406
+ ax.tick_params(axis='x', rotation=90)
407
+ sns.set_palette('magma')
408
+ plt.tight_layout()
409
+ st.pyplot(fig)
410
+ # Hotel Prices Across Indian States
411
+ st.subheader("Hotel Prices Across Indian States:")
412
+ st.write("""
413
+ - **Analysis:**
414
+ - Hotel prices vary significantly across Indian states, reflecting regional differences in demand and supply.
415
+ - States with popular tourist destinations, such as Goa and Rajasthan, tend to show higher hotel prices, as they attract more visitors and have a higher demand for accommodations.
416
+ - Conversely, states with less tourism or lower demand may exhibit more affordable pricing, catering to the local population or budget travelers.
417
+ - Price differences also reflect factors such as local economic conditions, infrastructure, and tourism policies in each state.
418
+ """)
419
+
420
+ # Regional category count by state
421
+ st.subheader("Category by State")
422
+ fig, ax = plt.subplots(figsize=(16, 6))
423
+ sns.countplot(data=data, x='state', hue='category', ax=ax, palette='Set1')
424
+ ax.set_title('Category by State')
425
+ ax.tick_params(axis='x', rotation=90)
426
+ plt.tight_layout()
427
+ st.pyplot(fig)
428
+ # Hotel Categories by State
429
+ st.subheader("Hotel Categories by State:")
430
+ st.write("""
431
+ - **Analysis:**
432
+ - States with a higher concentration of "Low Budget" and "Budget" hotels cater primarily to cost-conscious travelers, offering affordable accommodations for a wide range of customers.
433
+ - States with more "Luxury" hotels are likely to be major tourist hubs, such as Delhi, Mumbai, and Kerala, or regions that cater to premium audiences, offering high-end services for affluent customers.
434
+ - These states may also focus on attracting international tourists or business travelers who prefer premium amenities and luxury experiences.
435
+ """)
436
+
437
+ # Summary of Regional Price and Category Trends
438
+ st.subheader("Summary of Regional Price and Category Trends:")
439
+ st.write("""
440
+ - **Hotel Prices Across Indian States:** Prices vary significantly depending on the state, with tourist-heavy regions showing higher prices due to greater demand.
441
+ - **Hotel Categories by State:** States with more budget hotels focus on catering to price-sensitive travelers, while states with luxury hotels cater to premium audiences, often in tourist hotspots.
442
+ - **Overall Insight:** Regional trends indicate diverse pricing and category distributions, influenced by tourism, regional economics, and state-specific factors that shape hotel offerings across the country.
443
+ """)
444
+
445
+ st.title("Multivariate Analysis of Hotel Data")
446
+
447
+ # Create a subset of the data for the analysis
448
+ subset_data = data[['category', 'price', 'reviews', 'discount', 'cashback', 'rating']]
449
+
450
+ # Section 1: Price vs. Reviews by Category
451
+ st.header("Price vs. Reviews by Category")
452
+ fig1 = sns.catplot(data=data, x='reviews', y='price', hue='category', kind='strip', palette='Set2', height=6, aspect=1.5)
453
+ fig1.set_axis_labels("Reviews", "Price")
454
+ fig1.fig.suptitle('Price vs Reviews by Category', fontsize=16)
455
+ st.pyplot(fig1)
456
+ # Analysis Text for Price vs. Reviews by Category
457
+ st.write("""
458
+ - **Price Variation within Categories:**
459
+ - Wide price ranges exist within each category, with "Low Budget" hotels featuring both low- and high-priced options.
460
+ - This shows that pricing within categories isn't always uniform and may depend on other factors like location, amenities, and hotel size.
461
+
462
+ - **Price and Reviews Relationship:**
463
+ - There's a slight tendency for hotels with more reviews to have higher prices, possibly due to the influence of popularity, better marketing efforts, or higher quality services.
464
+
465
+ - **Summary:**
466
+ - The stripplot reveals a weak positive correlation between price and reviews, indicating that well-reviewed hotels tend to have higher prices.
467
+ """)
468
+
469
+ # Section 2: Price vs. Discount by Category
470
+ st.header("Price vs. Discount by Category")
471
+ fig2 = sns.catplot(data=data, x='discount', y='price', hue='category', kind='bar', palette='Set2', height=6, aspect=1.5)
472
+ fig2.set_axis_labels("Discount", "Price")
473
+ fig2.fig.suptitle('Price vs Discount by Category', fontsize=16)
474
+ st.pyplot(fig2)
475
+ # Analysis Text for Price vs. Discount by Category
476
+ st.write("""
477
+ - **Price and Discount Relationship:**
478
+ - The bar plot clearly shows that as hotel prices increase, discounts tend to decrease. This confirms a negative correlation between price and discount.
479
+
480
+ - **Category-Specific Trends:**
481
+ - "Low Budget" and "Budget" hotels offer much higher discounts compared to "Premium" and "Luxury" hotels, which typically offer fewer or smaller discounts.
482
+ - This trend highlights that budget-conscious categories use higher discounts to attract customers, whereas premium and luxury categories rely on factors other than discounts (e.g., quality, exclusivity) to appeal to their clientele.
483
 
484
+ - **Summary:**
485
+ - The plot confirms that lower-priced hotels use higher discounts to attract customers, while premium and luxury hotels maintain lower discount rates, aligning with typical market behavior.
486
+ """)
487
+
488
+
489
+ # Section 3: Price vs Cashback and Rating by Category (Stripplot)
490
+ st.header("Price vs Cashback and Rating by Category")
491
+ fig3, axes2 = plt.subplots(1, 2, figsize=(16, 6))
492
+
493
+ sns.stripplot(data=data, x='cashback', y='price', hue='category', ax=axes2[0], palette='Set2', jitter=True, dodge=True)
494
+ axes2[0].set_title('Price vs Cashback by Category')
495
+
496
+ sns.stripplot(data=data, x='rating', y='price', hue='category', ax=axes2[1], palette='Set2', jitter=True, dodge=True)
497
+ axes2[1].set_title('Price vs Rating by Category')
498
+
499
+ st.pyplot(fig3)
500
+
501
+
502
+ # Analysis Text for Price vs Cashback by Category
503
+ st.write("""
504
+ - **Price vs Cashback by Category:**
505
+ - The plot shows that cashback incentives tend to decrease as hotel prices increase. However, variations exist due to promotional offers that may affect cashback amounts.
506
+ - Lower-priced categories, such as "Budget" and "Low Budget," offer higher cashback incentives to attract price-sensitive customers.
507
+
508
+ - **Summary:**
509
+ - The stripplot reveals a clear trend where lower-priced hotels use cashback as an incentive to boost bookings, whereas higher-priced hotels focus on other value propositions (e.g., premium services).
510
+ """)
511
+ # Analysis Text for Price vs Rating by Category
512
+ st.write("""
513
+ - **Price vs Rating by Category:**
514
+ - Higher-priced categories like "Premium" and "Luxury" tend to have better ratings, indicating that customers perceive these hotels as providing superior value.
515
+ - Interestingly, some lower-priced hotels achieve high ratings, suggesting that other factors such as service quality and customer experience may contribute to higher satisfaction despite the lower price point.
516
+
517
+ - **Summary:**
518
+ - The stripplot shows that while higher-priced hotels emphasize quality to achieve better ratings, lower-priced hotels still manage to deliver satisfactory experiences for customers through factors like service quality.
519
+ """)
520
+ # Section 4: Correlation Heatmap
521
+ st.header("Correlation Matrix Heatmap")
522
+ numeric_columns = ['price', 'reviews', 'discount', 'cashback', 'rating']
523
+
524
+ # Compute the correlation matrix
525
+ correlation_matrix = data[numeric_columns].corr()
526
+
527
+ # Create a heatmap to visualize the correlation matrix
528
+ plt.figure(figsize=(10, 8))
529
+ sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5, vmin=-1, vmax=1)
530
+
531
+ # Set title for the plot
532
+ plt.title('Correlation Matrix Heatmap')
533
+
534
+ # Display the plot
535
+ st.pyplot(plt)
536
+
537
+ # Analysis Text for Correlation Heatmap
538
+ st.write("""
539
+ - **Price vs Discount/Cashback:**
540
+ - The heatmap shows a strong negative correlation between price and both discount and cashback. Higher-priced hotels tend to offer fewer discounts and cashback incentives, suggesting that premium offerings rely on value rather than incentives.
541
+
542
+ - **Price vs Rating:**
543
+ - A weak positive correlation is observed between price and rating. Higher-priced hotels generally have slightly better ratings, although this relationship is not very strong.
544
+
545
+ - **Reviews vs Rating:**
546
+ - A moderate positive correlation exists between reviews and ratings. Hotels that attract more reviews tend to have better ratings, likely due to a larger customer base providing feedback.
547
+
548
+ - **Reviews vs Price:**
549
+ - A weak positive correlation between reviews and price suggests that well-reviewed hotels are often priced higher, likely due to their popularity and perceived value.
550
+
551
+ - **Summary:**
552
+ - The heatmap provides insights into the relationships between key variables in the dataset. Higher-priced hotels tend to offer fewer discounts and cashback offers but generally have better ratings and more reviews, indicating that customers are willing to pay a premium for a better experience.
553
+ """)
554
+
555
+ st.header("Overall Summary")
556
+ st.write("""
557
+ - Most properties are affordable, with lower prices, cashback, and discounts dominating the dataset.
558
+ - Regional distribution shows states like Maharashtra and Madhya Pradesh having more hotels.
559
+ - The data reflects a market focused on affordability and basic amenities, with regional and category-specific variations.
560
+ - Cancellations and reviews provide further insights into customer behavior, while skewed distributions highlight potential outliers and trends in pricing and service offerings.
561
+ """)
562
+
563
+ st.header("Why Right-Skewed Trends Are Normal, Not Outliers")
564
+ st.write("""
565
+ - Right-skewed distributions for price, cashback, discounts, cancellations, reviews, and amenities are normal trends in the market.
566
+ - These trends represent the expected distribution of data where higher values are less frequent but are not considered outliers.
567
+ - The variations in cancellation patterns and review counts reflect typical customer behavior and industry dynamics.
568
+ """)
569
+ st.write("""
570
+ Since no outliers were detected, we can proceed with model training and selection.
571
+ With clean data, we can now focus on choosing the best algorithm, tuning hyperparameters, and evaluating model performance.
572
+ """)
573
+
574
+ else:
575
+ st.warning("No dataset found in session state. Please load the dataset into `st.session_state['data']`.")
576
+