Spaces:

Mpavan45
/

Hotel_Data_Analysis

Sleeping

App Files Community

Mpavan45 committed on Jan 24, 2025

Commit

ad49f99

verified ·

1 Parent(s): 9a7d598

Update pages/2_Data Cleaning and Processing .py

Browse files

Files changed (1) hide show

pages/2_Data Cleaning and Processing .py +77 -7

pages/2_Data Cleaning and Processing .py CHANGED Viewed

@@ -17,7 +17,6 @@ if df is not None:
     st.subheader("Dataset Preview:")
     st.write(df.head())  # Display the first 5 rows
-    st.subheader("Info of the Dataset:")
     # Redirect the output of df.info() to a string buffer
     buffer = StringIO()
     df.info(buf=buffer)
@@ -39,6 +38,13 @@ if df is not None:
     st.write("Cleaned 'Hotel Name' Column:")
     st.write(df[['Hotel Name']].head())
     # Cleaning the 'Rating' column
     st.subheader("Cleaning the 'Rating' Column:")
@@ -53,6 +59,12 @@ if df is not None:
     st.write("Cleaned 'Rating' Column:")
     st.write(df[['Rating']].head())
     # Cleaning the 'Location' column
     st.subheader("Cleaning the 'Location' Column:")
@@ -63,7 +75,11 @@ if df is not None:
     st.write("Cleaned 'Location' Column:")
     st.write(df[['Location']].head())
     # Cleaning the 'Discount' column
     st.subheader("Cleaning the 'Discount' Column:")
     def f(x):
@@ -76,6 +92,13 @@ if df is not None:
     st.write("Cleaned 'Discount' Column:")
     st.write(df[['Discount']].head())
     # Cleaning the 'Review Text' column
     st.subheader("Cleaning the 'Review Text' Column:")
@@ -89,6 +112,12 @@ if df is not None:
     st.write("Cleaned 'Review Text' Column:")
     st.write(df[['Review Text']].head())
     # Cleaning the 'Reviews' column
     st.subheader("Cleaning the 'Reviews' Column:")
@@ -97,26 +126,44 @@ if df is not None:
     st.write("Cleaned 'Reviews' Column:")
     st.write(df[['Reviews']].head())
     # Cleaning the 'Cashback' column
     st.subheader("Cleaning the 'Cashback' Column:")
     df['Cashback'] = df.apply(lambda row: '0' if (str(row['Cashback']) in ['Nan', 'np.nan', 'nan', ''] or pd.isnull(row['Cashback'])) else row['Cashback'], axis=1)
     st.write("Cleaned 'Cashback' Column:")
     st.write(df[['Cashback']].head())
     # Cleaning the 'Cancellation' column
     st.subheader("Cleaning the 'Cancellation' Column:")
     df['Cancellation'] = df['Cancellation'].replace('#NAME?', 'No')
     st.write("Cleaned 'Cancellation' Column:")
     st.write(df[['Cancellation']].head())
     # Cleaning the 'Price' column
     st.subheader("Cleaning the 'Price' Column:")
     df['Price'] = df['Price'].str.replace(',', '')
     df['Price'] = pd.to_numeric(df['Price'], errors='coerce')
     df = df.dropna(subset=['Price'])
     st.write("Cleaned 'Price' Column:")
     st.write(df[['Price']].head())
@@ -134,14 +181,36 @@ if df is not None:
     st.write("Added Free Services Based on Rating:")
     st.write(df[['Free Services']].head())
     # Store the cleaned data in session state
     st.session_state.cleaned_data = df
     # Display cleaned data
     st.subheader("Cleaned Data Preview:")
-    st.dataframe(df.head())
     # Save cleaned data to CSV and allow the user to download it
     st.subheader("Download the Cleaned Data")
     st.download_button(
@@ -151,6 +220,7 @@ if df is not None:
         mime="text/csv"
     )
 else:
     st.warning("No dataset found. Please upload a dataset on the Home page.")

     st.subheader("Dataset Preview:")
     st.write(df.head())  # Display the first 5 rows
     # Redirect the output of df.info() to a string buffer
     buffer = StringIO()
     df.info(buf=buffer)
     st.write("Cleaned 'Hotel Name' Column:")
     st.write(df[['Hotel Name']].head())
+    st.write("""
+    ### 1. **Cleaning the 'Hotel Name' Column**
+    This section cleans the 'Hotel Name' column by:
+    - Removing missing values.
+    - Removing unwanted text such as newline characters, "View on map", and years.
+    - Dropping rows with irrelevant hotel names (like "2021", "2022", "2023").
+    """)
     # Cleaning the 'Rating' column
     st.subheader("Cleaning the 'Rating' Column:")
     st.write("Cleaned 'Rating' Column:")
     st.write(df[['Rating']].head())
+    st.write("""
+    ### 2. **Cleaning the 'Rating' Column**
+    This part deals with cleaning the 'Rating' column:
+    - Removing missing values.
+    - Extracting numerical values from string ratings using regular expressions.
+    """)
     # Cleaning the 'Location' column
     st.subheader("Cleaning the 'Location' Column:")
     st.write("Cleaned 'Location' Column:")
     st.write(df[['Location']].head())
+    st.write("""
+    ### 3. **Cleaning the 'Location' Column**
+    - Missing values in the 'Location' column are dropped.
+    - The location is cleaned by extracting only words and formatting them consistently.
+    """)
     # Cleaning the 'Discount' column
     st.subheader("Cleaning the 'Discount' Column:")
     def f(x):
     st.write("Cleaned 'Discount' Column:")
     st.write(df[['Discount']].head())
+    st.write("""
+    ### 4. **Cleaning the 'Discount' Column**
+    This section:
+    - Extracts discount values from strings.
+    - Replaces missing or invalid discounts with '0'.
+    - Adjusts discounts greater than 50 by reducing them by 50.
+    """)
     # Cleaning the 'Review Text' column
     st.subheader("Cleaning the 'Review Text' Column:")
     st.write("Cleaned 'Review Text' Column:")
     st.write(df[['Review Text']].head())
+    st.write("""
+    ### 5. **Cleaning the 'Review Text' Column**
+    - The 'Review Text' is truncated to only include the first sentence.
+    - Ratings are mapped to corresponding descriptive review text.
+    - Invalid reviews are replaced based on conditions like missing or incomplete data.
+    """)
     # Cleaning the 'Reviews' column
     st.subheader("Cleaning the 'Reviews' Column:")
     st.write("Cleaned 'Reviews' Column:")
     st.write(df[['Reviews']].head())
+    st.write("""
+    ### 6. **Cleaning the 'Reviews' Column**
+    This part ensures that:
+    - Missing or invalid review values in the 'Reviews' column are replaced with '0'.
+    """)
     # Cleaning the 'Cashback' column
     st.subheader("Cleaning the 'Cashback' Column:")
     df['Cashback'] = df.apply(lambda row: '0' if (str(row['Cashback']) in ['Nan', 'np.nan', 'nan', ''] or pd.isnull(row['Cashback'])) else row['Cashback'], axis=1)
     st.write("Cleaned 'Cashback' Column:")
     st.write(df[['Cashback']].head())
+    st.write("""
+    ### 7. **Cleaning the 'Cashback' Column**
+    - Missing or invalid cashback values are replaced with '0' for consistency.
+    """)
     # Cleaning the 'Cancellation' column
     st.subheader("Cleaning the 'Cancellation' Column:")
     df['Cancellation'] = df['Cancellation'].replace('#NAME?', 'No')
     st.write("Cleaned 'Cancellation' Column:")
     st.write(df[['Cancellation']].head())
+    st.write("""
+    ### 8. **Cleaning the 'Cancellation' Column**
+    - Unwanted values (like '#NAME?') in the 'Cancellation' column are replaced with 'No'.
+    """)
     # Cleaning the 'Price' column
     st.subheader("Cleaning the 'Price' Column:")
     df['Price'] = df['Price'].str.replace(',', '')
     df['Price'] = pd.to_numeric(df['Price'], errors='coerce')
     df = df.dropna(subset=['Price'])
+    st.write("""
+    ### 9. **Cleaning the 'Price' Column**
+    - Commas in the 'Price' column are removed.
+    - Values are converted to numeric types, with errors coerced into NaN.
+    - Rows with missing or invalid prices are removed.
+    """)
     st.write("Cleaned 'Price' Column:")
     st.write(df[['Price']].head())
     st.write("Added Free Services Based on Rating:")
     st.write(df[['Free Services']].head())
+    st.write("""
+    ### 10. **Adding Free Services Based on Rating**
+    - A dictionary `hotel_ammenities_by_star` is used to define the free services based on the hotel’s rating.
+    - These services are added to the DataFrame as a new column 'Free Services'.
+    """)
     # Store the cleaned data in session state
     st.session_state.cleaned_data = df
+    st.write("""
+    ### 11. **Store Cleaned Data in Session State**
+    The cleaned DataFrame is stored in Streamlit’s session state, allowing it to persist across pages or interactions.
+    """)
     # Display cleaned data
     st.subheader("Cleaned Data Preview:")
+    st.dataframe(df)
+    st.write("""
+    ### 12. **Dataset Information**
+    Displays basic information about the DataFrame such as column data types, non-null counts, and memory usage.
+    """)
+    buffer = StringIO()
+    df.info(buf=buffer)
+    # Display the content in Streamlit as Markdown
+    st.subheader("Info of the Dataset:")
+    st.markdown(f"```{buffer.getvalue()}```")
+    st.write("""
+    ### 13. **Displaying Cleaned Data**
+    The cleaned dataset is displayed as a preview by showing the first few rows of the DataFrame.
+    """)
     # Save cleaned data to CSV and allow the user to download it
     st.subheader("Download the Cleaned Data")
     st.download_button(
         mime="text/csv"
     )
 else:
     st.warning("No dataset found. Please upload a dataset on the Home page.")