Spaces:

Mpavan45
/

Hotel_Data_Analysis

Sleeping

App Files Files Community

Mpavan45 committed on Jan 24, 2025

Commit

08421d4

verified ·

1 Parent(s): d8e0bfd

Update pages/2_Data Cleaning and Processing .py

Browse files

Files changed (1) hide show

pages/2_Data Cleaning and Processing .py +182 -7

pages/2_Data Cleaning and Processing .py CHANGED Viewed

@@ -3,6 +3,8 @@ import pandas as pd
 import os
 from io import StringIO
 import sys
 # Page Title
 st.markdown("<h1 style='text-align:center; color:#008080;'>Data Cleaning and Processing</h1>", unsafe_allow_html=True)
@@ -39,7 +41,7 @@ st.markdown(
     <style>
         .stApp {{
             background-image: url("{background_image_url}");
-            background-size: 100% auto;  /* Ensure the image width is 100% of the screen, and the height scales proportionally */
             background-repeat: repeat-y;  /* Repeat only vertically */
             background-position: top center;  /* Start repeating from the top center */
             background-attachment: fixed;  /* Keeps the background fixed as you scroll */
@@ -101,12 +103,6 @@ st.markdown(
     """,
     unsafe_allow_html=True,
 )
-import streamlit as st
-st.title("Data Cleaning and Processing")
-st.write("Perform data cleaning and preprocessing tasks here.")
 # Navigation Buttons
 st.markdown(
     """
@@ -117,3 +113,182 @@ st.markdown(
     """,
     unsafe_allow_html=True,
 )

 import os
 from io import StringIO
 import sys
+import re
+import numpy as np
 # Page Title
 st.markdown("<h1 style='text-align:center; color:#008080;'>Data Cleaning and Processing</h1>", unsafe_allow_html=True)
     <style>
         .stApp {{
             background-image: url("{background_image_url}");
+            background-size: auto;  /* Ensure the image width is 100% of the screen, and the height scales proportionally */
             background-repeat: repeat-y;  /* Repeat only vertically */
             background-position: top center;  /* Start repeating from the top center */
             background-attachment: fixed;  /* Keeps the background fixed as you scroll */
     """,
     unsafe_allow_html=True,
 )
 # Navigation Buttons
 st.markdown(
     """
     """,
     unsafe_allow_html=True,
 )
+st.write("""
+### 1. **Title and File Upload**
+This section sets the title of the application and includes a file uploader that allows the user to upload a CSV file for cleaning. The file should be of CSV type.
+""")
+st.title("Hotel Booking Data Cleaning and Analysis")
+uploaded_file = st.file_uploader("Upload your CSV file for cleaning", type=["csv"])
+st.write("""
+### 2. **Check if File is Uploaded**
+If a file is uploaded, it is read into a pandas DataFrame for further processing and cleaning.
+""")
+# Every statement after this point reads `df`. Without an uploaded file there
+# is nothing to clean, so end this script run here instead of letting the
+# first `df[...]` access fail with NameError.
+if uploaded_file is None:
+    st.info("Please upload a CSV file to begin cleaning.")
+    st.stop()
+df = pd.read_csv(uploaded_file)
+st.write("""
+### 3. **Cleaning the 'Hotel Name' Column**
+This section cleans the 'Hotel Name' column by:
+- Removing missing values.
+- Removing unwanted text such as newline characters, "View on map", and years.
+- Dropping rows with irrelevant hotel names (like "2021", "2022", "2023").
+""")
+# Missing-name count; the value is computed but neither shown nor stored.
+df['Hotel Name'].isna().sum()
+df.dropna(subset=['Hotel Name'], inplace=True)
+# Apply each (unwanted text, substitute) pair to the column in the listed order.
+for unwanted, substitute in (
+    ('\n', ','),
+    ('-View on map', ''),
+    ('View on map', ''),
+    ('-', ''),
+):
+    df['Hotel Name'] = df['Hotel Name'].str.replace(unwanted, substitute)
+# Drop rows whose entire hotel name is one of the listed year strings.
+year_only_rows = df['Hotel Name'].isin(['2021', '2022', '2023'])
+df.drop(index=df[year_only_rows].index, inplace=True)
+st.write("""
+### 4. **Cleaning the 'Rating' Column**
+This part deals with cleaning the 'Rating' column:
+- Removing missing values.
+- Extracting numerical values from string ratings using regular expressions.
+""")
+# Missing-rating count; again the value is discarded.
+df['Rating'].isna().sum()
+df.dropna(subset=['Rating'], inplace=True)
+def extract_rating(rating_str):
+    """Return the rating as a float, or NaN when none can be determined.
+
+    Args:
+        rating_str: A raw 'Rating' cell. Strings are searched for their first
+            run of digits; values that are already numeric are passed through.
+
+    Returns:
+        float: The first run of digits in a string converted to float (so
+        "4.5 stars" yields 4.0), the numeric value itself for int/float input,
+        or ``np.nan`` for anything else (including strings without digits).
+    """
+    # pandas may already have parsed the column as numbers; those cells have
+    # no text to search and previously all collapsed to NaN.
+    if isinstance(rating_str, (int, float, np.integer, np.floating)) and not isinstance(rating_str, bool):
+        return float(rating_str)
+    if isinstance(rating_str, str):
+        match = re.search(r'(\d+)', rating_str)
+        if match:
+            return float(match.group(1))
+    return np.nan
+# Convert each raw rating to a float (NaN when nothing could be extracted).
+df['Rating'] = df['Rating'].apply(extract_rating)
+st.write("""
+### 5. **Cleaning the 'Location' Column**
+- Missing values in the 'Location' column are dropped.
+- The location is cleaned by extracting only words and formatting them consistently.
+""")
+# Missing-location count; the value is computed but not used.
+df['Location'].isna().sum()
+df.dropna(subset=['Location'], inplace=True)
+# One pass per cell: keep only the word tokens, join them with single spaces,
+# then put a comma directly in front of every run of digits.
+df['Location'] = df['Location'].apply(
+    lambda raw_location: re.sub(
+        r"\d+",
+        r",\g<0>",
+        ' '.join(re.findall(r'\w+', raw_location)),
+    )
+)
+st.write("""
+### 6. **Cleaning the 'Discount' Column**
+This section:
+- Extracts discount values from strings.
+- Replaces missing or invalid discounts with '0'.
+- Adjusts discounts greater than 50 by reducing them by 50.
+""")
+def f(x):
+    """Return all non-overlapping two-digit runs found in ``str(x)``, in order."""
+    text = str(x)
+    return [found.group(0) for found in re.finditer(r"\d{2}", text)]
+# f() yields the list of two-digit runs found in each cell; .str[0] keeps the
+# first one (cells with no match presumably end up missing - handled next).
+df.Discount = df.Discount.apply(f).str[0]
+# Replace missing or placeholder values with the string '0' so that the
+# int() conversion below does not fail on them.
+df['Discount'] = df.apply(lambda row: '0' if (str(row['Discount']) in ['Nan', 'np.nan', 'nan', ''] or pd.isnull(row['Discount'])) else row['Discount'], axis=1)
+# Turn the digit strings into integers.
+df.Discount = df.Discount.apply(lambda x: int(x))
+# Any value above 50 is reduced by 50; values of 50 or less are kept as-is.
+df.Discount = df.Discount.apply(lambda x: x - 50 if x > 50 else x)
+st.write("""
+### 7. **Cleaning the 'Review Text' Column**
+- The 'Review Text' is truncated to only include the first sentence.
+- Ratings are mapped to corresponding descriptive review text.
+- Invalid reviews are replaced based on conditions like missing or incomplete data.
+""")
+# Keep only the text before the first '.' in each review.
+df['Review Text'] = df['Review Text'].str.split('.').str[0]
+# Map bare numeric scores to descriptive labels. The result is assigned back
+# to the column: calling replace(..., inplace=True) on df['Review Text'] acts
+# on a column selection, which is deprecated and does not update df when
+# pandas Copy-on-Write is enabled.
+df['Review Text'] = df['Review Text'].replace({
+    '10': 'Exceptional',
+    '9': 'Excellent',
+    '8': 'Very Good',
+    '7': 'Good',
+    '2': 'Bad',
+    '4': 'Bad',
+    '5': 'Bad',
+    '6': 'Bad',
+})
+# For rating-5 rows whose review text is missing or one of the listed
+# placeholder fragments, use 'Exceptional'; all other rows keep their text.
+df['Review Text'] = df.apply(lambda row: 'Exceptional' if (row['Rating'] == 5) and (row['Review Text'] in ['Nan', 'np', 'km', 'stars', 'Review'] or pd.isnull(row['Review Text'])) else row['Review Text'], axis=1)
+st.write("""
+### 8. **Cleaning the 'Reviews' Column**
+This part ensures that:
+- Missing or invalid review values in the 'Reviews' column are replaced with '0'.
+""")
+
+
+def _reviews_or_zero(row):
+    """Return '0' when the row's 'Reviews' value prints as a placeholder, else the value."""
+    current = row['Reviews']
+    if str(current) in ['nan', 'np', 'np.nan']:
+        return '0'
+    return current
+
+
+df['Reviews'] = df.apply(_reviews_or_zero, axis=1)
+st.write("""
+### 9. **Cleaning the 'Cashback' Column**
+- Missing or invalid cashback values are replaced with '0' for consistency.
+""")
+
+
+def _cashback_or_zero(row):
+    """Return '0' when the row's 'Cashback' value is a placeholder or null, else the value."""
+    current = row['Cashback']
+    if str(current) in ['Nan', 'np.nan', 'nan', ''] or pd.isnull(current):
+        return '0'
+    return current
+
+
+df['Cashback'] = df.apply(_cashback_or_zero, axis=1)
+st.write("""
+### 10. **Cleaning the 'Cancellation' Column**
+- Unwanted values (like '#NAME?') in the 'Cancellation' column are replaced with 'No'.
+""")
+df['Cancellation'] = df['Cancellation'].replace({'#NAME?': 'No'})
+st.write("""
+### 11. **Cleaning the 'Price' Column**
+- Commas in the 'Price' column are removed.
+- Values are converted to numeric types, with errors coerced into NaN.
+- Rows with missing or invalid prices are removed.
+""")
+# Strip the thousands separators, then parse; unparseable prices become NaN.
+price_without_commas = df['Price'].str.replace(',', '')
+df['Price'] = pd.to_numeric(price_without_commas, errors='coerce')
+df = df.dropna(subset=['Price'])
+st.write("""
+### 12. **Adding Free Services Based on Rating**
+- A dictionary `hotel_ammenities_by_star` is used to define the free services based on the hotel’s rating.
+- These services are added to the DataFrame as a new column 'Free Services'.
+""")
+# Every tier starts with the same three basics; each tier then adds its extras.
+_base_amenities = ["Wi-Fi", "Complimentary parking", "Basic toiletries"]
+hotel_ammenities_by_star = {
+    1.0: list(_base_amenities),
+    2.0: _base_amenities + ["Laundry facilities", "Local calls"],
+    3.0: _base_amenities + ["Fitness center access", "Hair dryer"],
+    4.0: _base_amenities + [
+        "Welcome drink",
+        "Turndown service",
+        "Minibar (select items)",
+        "Complimentary newspapers",
+        "Shoe shine service",
+        "In-room safe",
+    ],
+    5.0: _base_amenities + [
+        "Spa facilities (sauna, steam room)",
+        "Complimentary upgrade (subject to availability)",
+        "Personal shopping assistant",
+        "In-room minibar (select items)",
+        "Kids' club and childcare services",
+        "Transportation within city limits",
+        "Unlimited local calls and faxes",
+    ],
+}
+# Ratings that are not a key of the table (including NaN) get an empty list.
+df['Free Services'] = df['Rating'].apply(
+    lambda star_rating: hotel_ammenities_by_star.get(star_rating, [])
+)
+st.write("""
+### 13. **Store Cleaned Data in Session State**
+The cleaned DataFrame is stored in Streamlit’s session state, allowing it to persist across pages or interactions.
+""")
+st.session_state.cleaned_data = df
+st.write("""
+### 14. **Displaying Cleaned Data**
+The cleaned dataset is displayed as a preview by showing the first few rows of the DataFrame.
+""")
+st.write("### Cleaned Data Preview")
+preview_rows = df.head()
+st.dataframe(preview_rows)
+st.write("""
+### 15. **Download Button for CSV**
+A button is provided for the user to download the cleaned data as a CSV file, enabling easy access to the results.
+""")
+st.write("### Download the Cleaned Data")
+# Serialise once, without the index column, and hand the text to the button.
+cleaned_csv = df.to_csv(index=False)
+st.download_button(
+    label="Download CSV",
+    data=cleaned_csv,
+    file_name="Cleaned_Agoda_Data.csv",
+    mime="text/csv",
+)
+st.write("""
+### 16. **Dataset Information**
+Displays basic information about the DataFrame such as column data types, non-null counts, and memory usage.
+""")
+st.write("### Dataset Information")
+# DataFrame.info() writes its report to a stream and returns None, so passing
+# its return value to st.text() never shows the report. Capture the report in
+# an in-memory buffer and render that text instead.
+info_buffer = StringIO()
+df.info(buf=info_buffer)
+st.text(info_buffer.getvalue())