Spaces:

Mpavan45
/

Hotel_Data_Analysis

Sleeping

App Files Files Community

Mpavan45 committed on Jan 9, 2025

Commit

557481d

verified ·

1 Parent(s): cc23e5b

Update pages/4_EDA( Exploratory Data Analysis).py

Browse files

Files changed (1) hide show

pages/4_EDA( Exploratory Data Analysis).py +23 -20

pages/4_EDA( Exploratory Data Analysis).py CHANGED Viewed

@@ -180,14 +180,14 @@ if data is not None:
         **Summary:**
         Hotels tend to prioritize basic amenities like parking, toiletries, and hair dryers, while luxurious amenities are offered less frequently.
     """)
     # Streamlit app
     st.title("Bivariate Analysis")
     # Price vs Rating scatter plot
     st.subheader("Price vs Rating")
     fig, ax = plt.subplots(figsize=(7, 5))
-    sns.scatterplot(x='rating', y='price', data=agoda_df, color='orange')
     ax.set_title('Price vs Rating')
     ax.set_xlabel('Rating')
     ax.set_ylabel('Price')
@@ -205,7 +205,7 @@ if data is not None:
     # Price vs Discount scatter plot
     st.subheader("Price vs Discount")
     fig, ax = plt.subplots(figsize=(7, 5))
-    sns.scatterplot(x='discount', y='price', data=agoda_df, color='green')
     ax.set_title('Price vs Discount')
     ax.set_xlabel('Discount')
     ax.set_ylabel('Price')
@@ -214,7 +214,7 @@ if data is not None:
     # Price vs Cashback scatter plot
     st.subheader("Price vs Cashback")
     fig, ax = plt.subplots(figsize=(7, 5))
-    sns.scatterplot(x='cashback', y='price', data=agoda_df, color='blue')
     ax.set_title('Price vs Cashback')
     ax.set_xlabel('Cashback')
     ax.set_ylabel('Price')
@@ -223,7 +223,7 @@ if data is not None:
     # Price vs Category bar plot
     st.subheader("Price vs Category")
     fig, ax = plt.subplots(figsize=(7, 5))
-    sns.barplot(x='category', y='price', data=agoda_df, palette='Set2')
     ax.set_title('Price vs Category')
     ax.set_xlabel('Category')
     ax.set_ylabel('Price')
@@ -232,7 +232,7 @@ if data is not None:
     # Rating vs Category bar plot
     st.subheader("Rating vs Category")
     fig, ax = plt.subplots(figsize=(7, 5))
-    sns.barplot(x='category', y='rating', data=agoda_df, palette='Set1')
     ax.set_title('Rating vs Category')
     ax.set_xlabel('Category')
     ax.set_ylabel('Rating')
@@ -241,7 +241,7 @@ if data is not None:
     # Discount vs Category box plot
     st.subheader("Discount vs Category")
     fig, ax = plt.subplots(figsize=(7, 5))
-    sns.boxplot(x='category', y='discount', data=agoda_df, palette='Set2')
     ax.set_title('Discount vs Category')
     ax.set_xlabel('Category')
     ax.set_ylabel('Discount')
@@ -250,7 +250,7 @@ if data is not None:
     # Cashback vs Category violin plot
     st.subheader("Cashback vs Category")
     fig, ax = plt.subplots(figsize=(7, 5))
-    sns.violinplot(x='category', y='cashback', data=agoda_df, palette='Set3')
     ax.set_title('Cashback vs Category')
     ax.set_xlabel('Category')
     ax.set_ylabel('Cashback')
@@ -259,7 +259,7 @@ if data is not None:
     # Reviews vs Category count plot
     st.subheader("Reviews vs Category")
     fig, ax = plt.subplots(figsize=(7, 5))
-    sns.countplot(x='category', data=agoda_df, palette='Set1')
     ax.set_title('Reviews vs Category (Count)')
     ax.set_xlabel('Category')
     ax.set_ylabel('Count of Reviews')
@@ -268,7 +268,7 @@ if data is not None:
     # Regional price analysis by state
     st.subheader("Price by State")
     fig, ax = plt.subplots(figsize=(16, 6))
-    sns.barplot(data=agoda_df, x='state', y='price', ax=ax, color='green')
     ax.set_title('Price by State')
     ax.tick_params(axis='x', rotation=90)
     sns.set_palette('magma')
@@ -278,7 +278,7 @@ if data is not None:
     # Regional category count by state
     st.subheader("Category by State")
     fig, ax = plt.subplots(figsize=(16, 6))
-    sns.countplot(data=agoda_df, x='state', hue='category', ax=ax, palette='Set1')
     ax.set_title('Category by State')
     ax.tick_params(axis='x', rotation=90)
     plt.tight_layout()
@@ -290,23 +290,22 @@ if data is not None:
         - **Hotel Categories by State:** States with more "Low Budget" and "Budget" hotels cater to cost-conscious travelers. States with more "Luxury" hotels are likely tourist hubs or cater to premium audiences.
         - **Summary:** Regional trends indicate diverse pricing and category distributions, influenced by tourism and economic conditions in different states.
     """)
     st.title("Multivariate Analysis of Hotel Data")
     # Create a subset of the data for the analysis
-    subset_data = agoda_df[['category', 'price', 'reviews', 'discount', 'cashback', 'rating']]
     # Section 1: Price vs. Reviews by Category
     st.header("Price vs. Reviews by Category")
-    fig1 = sns.catplot(data=agoda_df, x='reviews', y='price', hue='category', kind='strip', palette='Set2', height=6, aspect=1.5)
     fig1.set_axis_labels("Reviews", "Price")
     fig1.fig.suptitle('Price vs Reviews by Category', fontsize=16)
     st.pyplot(fig1)
     # Section 2: Price vs. Discount by Category
     st.header("Price vs. Discount by Category")
-    fig2 = sns.catplot(data=agoda_df, x='discount', y='price', hue='category', kind='bar', palette='Set2', height=6, aspect=1.5)
     fig2.set_axis_labels("Discount", "Price")
     fig2.fig.suptitle('Price vs Discount by Category', fontsize=16)
     st.pyplot(fig2)
@@ -315,14 +314,14 @@ if data is not None:
     st.header("Price vs Cashback and Rating by Category")
     fig3, axes2 = plt.subplots(1, 2, figsize=(16, 6))
-    sns.stripplot(data=agoda_df, x='cashback', y='price', hue='category', ax=axes2[0], palette='Set2', jitter=True, dodge=True)
     axes2[0].set_title('Price vs Cashback by Category')
-    sns.stripplot(data=agoda_df, x='rating', y='price', hue='category', ax=axes2[1], palette='Set2', jitter=True, dodge=True)
     axes2[1].set_title('Price vs Rating by Category')
     st.pyplot(fig3)
     # Insights and analysis
     st.header("Plot-Wise Analysis Insights")
@@ -350,7 +349,7 @@ if data is not None:
     numeric_columns = ['price', 'reviews', 'discount', 'cashback', 'rating']
     # Compute the correlation matrix
-    correlation_matrix = agoda_df[numeric_columns].corr()
     # Create a heatmap to visualize the correlation matrix
     plt.figure(figsize=(10, 8))
@@ -383,6 +382,10 @@ if data is not None:
         - These trends represent the expected distribution of data where higher values are less frequent but are not considered outliers.
         - The variations in cancellation patterns and review counts reflect typical customer behavior and industry dynamics.
     """)
 else:

         **Summary:**
         Hotels tend to prioritize basic amenities like parking, toiletries, and hair dryers, while luxurious amenities are offered less frequently.
     """)
     # Streamlit app
     st.title("Bivariate Analysis")
     # Price vs Rating scatter plot
     st.subheader("Price vs Rating")
     fig, ax = plt.subplots(figsize=(7, 5))
+    sns.scatterplot(x='rating', y='price', data=data, color='orange')
     ax.set_title('Price vs Rating')
     ax.set_xlabel('Rating')
     ax.set_ylabel('Price')
     # Price vs Discount scatter plot
     st.subheader("Price vs Discount")
     fig, ax = plt.subplots(figsize=(7, 5))
+    sns.scatterplot(x='discount', y='price', data=data, color='green')
     ax.set_title('Price vs Discount')
     ax.set_xlabel('Discount')
     ax.set_ylabel('Price')
     # Price vs Cashback scatter plot
     st.subheader("Price vs Cashback")
     fig, ax = plt.subplots(figsize=(7, 5))
+    sns.scatterplot(x='cashback', y='price', data=data, color='blue')
     ax.set_title('Price vs Cashback')
     ax.set_xlabel('Cashback')
     ax.set_ylabel('Price')
     # Price vs Category bar plot
     st.subheader("Price vs Category")
     fig, ax = plt.subplots(figsize=(7, 5))
+    sns.barplot(x='category', y='price', data=data, palette='Set2')
     ax.set_title('Price vs Category')
     ax.set_xlabel('Category')
     ax.set_ylabel('Price')
     # Rating vs Category bar plot
     st.subheader("Rating vs Category")
     fig, ax = plt.subplots(figsize=(7, 5))
+    sns.barplot(x='category', y='rating', data=data, palette='Set1')
     ax.set_title('Rating vs Category')
     ax.set_xlabel('Category')
     ax.set_ylabel('Rating')
     # Discount vs Category box plot
     st.subheader("Discount vs Category")
     fig, ax = plt.subplots(figsize=(7, 5))
+    sns.boxplot(x='category', y='discount', data=data, palette='Set2')
     ax.set_title('Discount vs Category')
     ax.set_xlabel('Category')
     ax.set_ylabel('Discount')
     # Cashback vs Category violin plot
     st.subheader("Cashback vs Category")
     fig, ax = plt.subplots(figsize=(7, 5))
+    sns.violinplot(x='category', y='cashback', data=data, palette='Set3')
     ax.set_title('Cashback vs Category')
     ax.set_xlabel('Category')
     ax.set_ylabel('Cashback')
     # Reviews vs Category count plot
     st.subheader("Reviews vs Category")
     fig, ax = plt.subplots(figsize=(7, 5))
+    sns.countplot(x='category', data=data, palette='Set1')
     ax.set_title('Reviews vs Category (Count)')
     ax.set_xlabel('Category')
     ax.set_ylabel('Count of Reviews')
     # Regional price analysis by state
     st.subheader("Price by State")
     fig, ax = plt.subplots(figsize=(16, 6))
+    sns.barplot(data=data, x='state', y='price', ax=ax, color='green')
     ax.set_title('Price by State')
     ax.tick_params(axis='x', rotation=90)
     sns.set_palette('magma')
     # Regional category count by state
     st.subheader("Category by State")
     fig, ax = plt.subplots(figsize=(16, 6))
+    sns.countplot(data=data, x='state', hue='category', ax=ax, palette='Set1')
     ax.set_title('Category by State')
     ax.tick_params(axis='x', rotation=90)
     plt.tight_layout()
         - **Hotel Categories by State:** States with more "Low Budget" and "Budget" hotels cater to cost-conscious travelers. States with more "Luxury" hotels are likely tourist hubs or cater to premium audiences.
         - **Summary:** Regional trends indicate diverse pricing and category distributions, influenced by tourism and economic conditions in different states.
     """)
     st.title("Multivariate Analysis of Hotel Data")
     # Create a subset of the data for the analysis
+    subset_data = data[['category', 'price', 'reviews', 'discount', 'cashback', 'rating']]
     # Section 1: Price vs. Reviews by Category
     st.header("Price vs. Reviews by Category")
+    fig1 = sns.catplot(data=data, x='reviews', y='price', hue='category', kind='strip', palette='Set2', height=6, aspect=1.5)
     fig1.set_axis_labels("Reviews", "Price")
     fig1.fig.suptitle('Price vs Reviews by Category', fontsize=16)
     st.pyplot(fig1)
     # Section 2: Price vs. Discount by Category
     st.header("Price vs. Discount by Category")
+    fig2 = sns.catplot(data=data, x='discount', y='price', hue='category', kind='bar', palette='Set2', height=6, aspect=1.5)
     fig2.set_axis_labels("Discount", "Price")
     fig2.fig.suptitle('Price vs Discount by Category', fontsize=16)
     st.pyplot(fig2)
     st.header("Price vs Cashback and Rating by Category")
     fig3, axes2 = plt.subplots(1, 2, figsize=(16, 6))
+    sns.stripplot(data=data, x='cashback', y='price', hue='category', ax=axes2[0], palette='Set2', jitter=True, dodge=True)
     axes2[0].set_title('Price vs Cashback by Category')
+    sns.stripplot(data=data, x='rating', y='price', hue='category', ax=axes2[1], palette='Set2', jitter=True, dodge=True)
     axes2[1].set_title('Price vs Rating by Category')
     st.pyplot(fig3)
     # Insights and analysis
     st.header("Plot-Wise Analysis Insights")
     numeric_columns = ['price', 'reviews', 'discount', 'cashback', 'rating']
     # Compute the correlation matrix
+    correlation_matrix = data[numeric_columns].corr()
     # Create a heatmap to visualize the correlation matrix
     plt.figure(figsize=(10, 8))
         - These trends represent the expected distribution of data where higher values are less frequent but are not considered outliers.
         - The variations in cancellation patterns and review counts reflect typical customer behavior and industry dynamics.
     """)
+    st.write("""
+        Since no outliers were detected, we can proceed with model training and selection.
+        With clean data, we can now focus on choosing the best algorithm, tuning hyperparameters, and evaluating model performance.
+    """)
 else: