Spaces:
Sleeping
Update pages/4_EDA( Exploratory Data Analysis).py
Browse files
pages/4_EDA( Exploratory Data Analysis).py
CHANGED
|
@@ -180,14 +180,14 @@ if data is not None:
|
|
| 180 |
**Summary:**
|
| 181 |
Hotels tend to prioritize basic amenities like parking, toiletries, and hair dryers, while luxurious amenities are offered less frequently.
|
| 182 |
""")
|
| 183 |
-
|
| 184 |
# Streamlit app
|
| 185 |
st.title("Bivariate Analysis")
|
| 186 |
|
| 187 |
# Price vs Rating scatter plot
|
| 188 |
st.subheader("Price vs Rating")
|
| 189 |
fig, ax = plt.subplots(figsize=(7, 5))
|
| 190 |
-
sns.scatterplot(x='rating', y='price', data=
|
| 191 |
ax.set_title('Price vs Rating')
|
| 192 |
ax.set_xlabel('Rating')
|
| 193 |
ax.set_ylabel('Price')
|
|
@@ -205,7 +205,7 @@ if data is not None:
|
|
| 205 |
# Price vs Discount scatter plot
|
| 206 |
st.subheader("Price vs Discount")
|
| 207 |
fig, ax = plt.subplots(figsize=(7, 5))
|
| 208 |
-
sns.scatterplot(x='discount', y='price', data=
|
| 209 |
ax.set_title('Price vs Discount')
|
| 210 |
ax.set_xlabel('Discount')
|
| 211 |
ax.set_ylabel('Price')
|
|
@@ -214,7 +214,7 @@ if data is not None:
|
|
| 214 |
# Price vs Cashback scatter plot
|
| 215 |
st.subheader("Price vs Cashback")
|
| 216 |
fig, ax = plt.subplots(figsize=(7, 5))
|
| 217 |
-
sns.scatterplot(x='cashback', y='price', data=
|
| 218 |
ax.set_title('Price vs Cashback')
|
| 219 |
ax.set_xlabel('Cashback')
|
| 220 |
ax.set_ylabel('Price')
|
|
@@ -223,7 +223,7 @@ if data is not None:
|
|
| 223 |
# Price vs Category bar plot
|
| 224 |
st.subheader("Price vs Category")
|
| 225 |
fig, ax = plt.subplots(figsize=(7, 5))
|
| 226 |
-
sns.barplot(x='category', y='price', data=
|
| 227 |
ax.set_title('Price vs Category')
|
| 228 |
ax.set_xlabel('Category')
|
| 229 |
ax.set_ylabel('Price')
|
|
@@ -232,7 +232,7 @@ if data is not None:
|
|
| 232 |
# Rating vs Category bar plot
|
| 233 |
st.subheader("Rating vs Category")
|
| 234 |
fig, ax = plt.subplots(figsize=(7, 5))
|
| 235 |
-
sns.barplot(x='category', y='rating', data=
|
| 236 |
ax.set_title('Rating vs Category')
|
| 237 |
ax.set_xlabel('Category')
|
| 238 |
ax.set_ylabel('Rating')
|
|
@@ -241,7 +241,7 @@ if data is not None:
|
|
| 241 |
# Discount vs Category box plot
|
| 242 |
st.subheader("Discount vs Category")
|
| 243 |
fig, ax = plt.subplots(figsize=(7, 5))
|
| 244 |
-
sns.boxplot(x='category', y='discount', data=
|
| 245 |
ax.set_title('Discount vs Category')
|
| 246 |
ax.set_xlabel('Category')
|
| 247 |
ax.set_ylabel('Discount')
|
|
@@ -250,7 +250,7 @@ if data is not None:
|
|
| 250 |
# Cashback vs Category violin plot
|
| 251 |
st.subheader("Cashback vs Category")
|
| 252 |
fig, ax = plt.subplots(figsize=(7, 5))
|
| 253 |
-
sns.violinplot(x='category', y='cashback', data=
|
| 254 |
ax.set_title('Cashback vs Category')
|
| 255 |
ax.set_xlabel('Category')
|
| 256 |
ax.set_ylabel('Cashback')
|
|
@@ -259,7 +259,7 @@ if data is not None:
|
|
| 259 |
# Reviews vs Category count plot
|
| 260 |
st.subheader("Reviews vs Category")
|
| 261 |
fig, ax = plt.subplots(figsize=(7, 5))
|
| 262 |
-
sns.countplot(x='category', data=
|
| 263 |
ax.set_title('Reviews vs Category (Count)')
|
| 264 |
ax.set_xlabel('Category')
|
| 265 |
ax.set_ylabel('Count of Reviews')
|
|
@@ -268,7 +268,7 @@ if data is not None:
|
|
| 268 |
# Regional price analysis by state
|
| 269 |
st.subheader("Price by State")
|
| 270 |
fig, ax = plt.subplots(figsize=(16, 6))
|
| 271 |
-
sns.barplot(data=
|
| 272 |
ax.set_title('Price by State')
|
| 273 |
ax.tick_params(axis='x', rotation=90)
|
| 274 |
sns.set_palette('magma')
|
|
@@ -278,7 +278,7 @@ if data is not None:
|
|
| 278 |
# Regional category count by state
|
| 279 |
st.subheader("Category by State")
|
| 280 |
fig, ax = plt.subplots(figsize=(16, 6))
|
| 281 |
-
sns.countplot(data=
|
| 282 |
ax.set_title('Category by State')
|
| 283 |
ax.tick_params(axis='x', rotation=90)
|
| 284 |
plt.tight_layout()
|
|
@@ -290,23 +290,22 @@ if data is not None:
|
|
| 290 |
- **Hotel Categories by State:** States with more "Low Budget" and "Budget" hotels cater to cost-conscious travelers. States with more "Luxury" hotels are likely tourist hubs or cater to premium audiences.
|
| 291 |
- **Summary:** Regional trends indicate diverse pricing and category distributions, influenced by tourism and economic conditions in different states.
|
| 292 |
""")
|
| 293 |
-
|
| 294 |
-
|
| 295 |
st.title("Multivariate Analysis of Hotel Data")
|
| 296 |
|
| 297 |
# Create a subset of the data for the analysis
|
| 298 |
-
subset_data =
|
| 299 |
|
| 300 |
# Section 1: Price vs. Reviews by Category
|
| 301 |
st.header("Price vs. Reviews by Category")
|
| 302 |
-
fig1 = sns.catplot(data=
|
| 303 |
fig1.set_axis_labels("Reviews", "Price")
|
| 304 |
fig1.fig.suptitle('Price vs Reviews by Category', fontsize=16)
|
| 305 |
st.pyplot(fig1)
|
| 306 |
|
| 307 |
# Section 2: Price vs. Discount by Category
|
| 308 |
st.header("Price vs. Discount by Category")
|
| 309 |
-
fig2 = sns.catplot(data=
|
| 310 |
fig2.set_axis_labels("Discount", "Price")
|
| 311 |
fig2.fig.suptitle('Price vs Discount by Category', fontsize=16)
|
| 312 |
st.pyplot(fig2)
|
|
@@ -315,14 +314,14 @@ if data is not None:
|
|
| 315 |
st.header("Price vs Cashback and Rating by Category")
|
| 316 |
fig3, axes2 = plt.subplots(1, 2, figsize=(16, 6))
|
| 317 |
|
| 318 |
-
sns.stripplot(data=
|
| 319 |
axes2[0].set_title('Price vs Cashback by Category')
|
| 320 |
|
| 321 |
-
sns.stripplot(data=
|
| 322 |
axes2[1].set_title('Price vs Rating by Category')
|
| 323 |
|
| 324 |
st.pyplot(fig3)
|
| 325 |
-
|
| 326 |
# Insights and analysis
|
| 327 |
st.header("Plot-Wise Analysis Insights")
|
| 328 |
|
|
@@ -350,7 +349,7 @@ if data is not None:
|
|
| 350 |
numeric_columns = ['price', 'reviews', 'discount', 'cashback', 'rating']
|
| 351 |
|
| 352 |
# Compute the correlation matrix
|
| 353 |
-
correlation_matrix =
|
| 354 |
|
| 355 |
# Create a heatmap to visualize the correlation matrix
|
| 356 |
plt.figure(figsize=(10, 8))
|
|
@@ -383,6 +382,10 @@ if data is not None:
|
|
| 383 |
- These trends represent the expected distribution of data where higher values are less frequent but are not considered outliers.
|
| 384 |
- The variations in cancellation patterns and review counts reflect typical customer behavior and industry dynamics.
|
| 385 |
""")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 386 |
|
| 387 |
|
| 388 |
else:
|
|
|
|
| 180 |
**Summary:**
|
| 181 |
Hotels tend to prioritize basic amenities like parking, toiletries, and hair dryers, while luxurious amenities are offered less frequently.
|
| 182 |
""")
|
| 183 |
+
|
| 184 |
# Streamlit app
|
| 185 |
st.title("Bivariate Analysis")
|
| 186 |
|
| 187 |
# Price vs Rating scatter plot
|
| 188 |
st.subheader("Price vs Rating")
|
| 189 |
fig, ax = plt.subplots(figsize=(7, 5))
|
| 190 |
+
sns.scatterplot(x='rating', y='price', data=data, color='orange')
|
| 191 |
ax.set_title('Price vs Rating')
|
| 192 |
ax.set_xlabel('Rating')
|
| 193 |
ax.set_ylabel('Price')
|
|
|
|
| 205 |
# Price vs Discount scatter plot
|
| 206 |
st.subheader("Price vs Discount")
|
| 207 |
fig, ax = plt.subplots(figsize=(7, 5))
|
| 208 |
+
sns.scatterplot(x='discount', y='price', data=data, color='green')
|
| 209 |
ax.set_title('Price vs Discount')
|
| 210 |
ax.set_xlabel('Discount')
|
| 211 |
ax.set_ylabel('Price')
|
|
|
|
| 214 |
# Price vs Cashback scatter plot
|
| 215 |
st.subheader("Price vs Cashback")
|
| 216 |
fig, ax = plt.subplots(figsize=(7, 5))
|
| 217 |
+
sns.scatterplot(x='cashback', y='price', data=data, color='blue')
|
| 218 |
ax.set_title('Price vs Cashback')
|
| 219 |
ax.set_xlabel('Cashback')
|
| 220 |
ax.set_ylabel('Price')
|
|
|
|
| 223 |
# Price vs Category bar plot
|
| 224 |
st.subheader("Price vs Category")
|
| 225 |
fig, ax = plt.subplots(figsize=(7, 5))
|
| 226 |
+
sns.barplot(x='category', y='price', data=data, palette='Set2')
|
| 227 |
ax.set_title('Price vs Category')
|
| 228 |
ax.set_xlabel('Category')
|
| 229 |
ax.set_ylabel('Price')
|
|
|
|
| 232 |
# Rating vs Category bar plot
|
| 233 |
st.subheader("Rating vs Category")
|
| 234 |
fig, ax = plt.subplots(figsize=(7, 5))
|
| 235 |
+
sns.barplot(x='category', y='rating', data=data, palette='Set1')
|
| 236 |
ax.set_title('Rating vs Category')
|
| 237 |
ax.set_xlabel('Category')
|
| 238 |
ax.set_ylabel('Rating')
|
|
|
|
| 241 |
# Discount vs Category box plot
|
| 242 |
st.subheader("Discount vs Category")
|
| 243 |
fig, ax = plt.subplots(figsize=(7, 5))
|
| 244 |
+
sns.boxplot(x='category', y='discount', data=data, palette='Set2')
|
| 245 |
ax.set_title('Discount vs Category')
|
| 246 |
ax.set_xlabel('Category')
|
| 247 |
ax.set_ylabel('Discount')
|
|
|
|
| 250 |
# Cashback vs Category violin plot
|
| 251 |
st.subheader("Cashback vs Category")
|
| 252 |
fig, ax = plt.subplots(figsize=(7, 5))
|
| 253 |
+
sns.violinplot(x='category', y='cashback', data=data, palette='Set3')
|
| 254 |
ax.set_title('Cashback vs Category')
|
| 255 |
ax.set_xlabel('Category')
|
| 256 |
ax.set_ylabel('Cashback')
|
|
|
|
| 259 |
# Reviews vs Category count plot
|
| 260 |
st.subheader("Reviews vs Category")
|
| 261 |
fig, ax = plt.subplots(figsize=(7, 5))
|
| 262 |
+
sns.countplot(x='category', data=data, palette='Set1')
|
| 263 |
ax.set_title('Reviews vs Category (Count)')
|
| 264 |
ax.set_xlabel('Category')
|
| 265 |
ax.set_ylabel('Count of Reviews')
|
|
|
|
| 268 |
# Regional price analysis by state
|
| 269 |
st.subheader("Price by State")
|
| 270 |
fig, ax = plt.subplots(figsize=(16, 6))
|
| 271 |
+
sns.barplot(data=data, x='state', y='price', ax=ax, color='green')
|
| 272 |
ax.set_title('Price by State')
|
| 273 |
ax.tick_params(axis='x', rotation=90)
|
| 274 |
sns.set_palette('magma')
|
|
|
|
| 278 |
# Regional category count by state
|
| 279 |
st.subheader("Category by State")
|
| 280 |
fig, ax = plt.subplots(figsize=(16, 6))
|
| 281 |
+
sns.countplot(data=data, x='state', hue='category', ax=ax, palette='Set1')
|
| 282 |
ax.set_title('Category by State')
|
| 283 |
ax.tick_params(axis='x', rotation=90)
|
| 284 |
plt.tight_layout()
|
|
|
|
| 290 |
- **Hotel Categories by State:** States with more "Low Budget" and "Budget" hotels cater to cost-conscious travelers. States with more "Luxury" hotels are likely tourist hubs or cater to premium audiences.
|
| 291 |
- **Summary:** Regional trends indicate diverse pricing and category distributions, influenced by tourism and economic conditions in different states.
|
| 292 |
""")
|
| 293 |
+
|
|
|
|
| 294 |
st.title("Multivariate Analysis of Hotel Data")
|
| 295 |
|
| 296 |
# Create a subset of the data for the analysis
|
| 297 |
+
subset_data = data[['category', 'price', 'reviews', 'discount', 'cashback', 'rating']]
|
| 298 |
|
| 299 |
# Section 1: Price vs. Reviews by Category
|
| 300 |
st.header("Price vs. Reviews by Category")
|
| 301 |
+
fig1 = sns.catplot(data=data, x='reviews', y='price', hue='category', kind='strip', palette='Set2', height=6, aspect=1.5)
|
| 302 |
fig1.set_axis_labels("Reviews", "Price")
|
| 303 |
fig1.fig.suptitle('Price vs Reviews by Category', fontsize=16)
|
| 304 |
st.pyplot(fig1)
|
| 305 |
|
| 306 |
# Section 2: Price vs. Discount by Category
|
| 307 |
st.header("Price vs. Discount by Category")
|
| 308 |
+
fig2 = sns.catplot(data=data, x='discount', y='price', hue='category', kind='bar', palette='Set2', height=6, aspect=1.5)
|
| 309 |
fig2.set_axis_labels("Discount", "Price")
|
| 310 |
fig2.fig.suptitle('Price vs Discount by Category', fontsize=16)
|
| 311 |
st.pyplot(fig2)
|
|
|
|
| 314 |
st.header("Price vs Cashback and Rating by Category")
|
| 315 |
fig3, axes2 = plt.subplots(1, 2, figsize=(16, 6))
|
| 316 |
|
| 317 |
+
sns.stripplot(data=data, x='cashback', y='price', hue='category', ax=axes2[0], palette='Set2', jitter=True, dodge=True)
|
| 318 |
axes2[0].set_title('Price vs Cashback by Category')
|
| 319 |
|
| 320 |
+
sns.stripplot(data=data, x='rating', y='price', hue='category', ax=axes2[1], palette='Set2', jitter=True, dodge=True)
|
| 321 |
axes2[1].set_title('Price vs Rating by Category')
|
| 322 |
|
| 323 |
st.pyplot(fig3)
|
| 324 |
+
|
| 325 |
# Insights and analysis
|
| 326 |
st.header("Plot-Wise Analysis Insights")
|
| 327 |
|
|
|
|
| 349 |
numeric_columns = ['price', 'reviews', 'discount', 'cashback', 'rating']
|
| 350 |
|
| 351 |
# Compute the correlation matrix
|
| 352 |
+
correlation_matrix = data[numeric_columns].corr()
|
| 353 |
|
| 354 |
# Create a heatmap to visualize the correlation matrix
|
| 355 |
plt.figure(figsize=(10, 8))
|
|
|
|
| 382 |
- These trends represent the expected distribution of data where higher values are less frequent but are not considered outliers.
|
| 383 |
- The variations in cancellation patterns and review counts reflect typical customer behavior and industry dynamics.
|
| 384 |
""")
|
| 385 |
+
st.write("""
|
| 386 |
+
Since no outliers were detected, we can proceed with model training and selection.
|
| 387 |
+
With clean data, we can now focus on choosing the best algorithm, tuning hyperparameters, and evaluating model performance.
|
| 388 |
+
""")
|
| 389 |
|
| 390 |
|
| 391 |
else:
|