Spaces:

Mpavan45
/

Hotel_Data_Analysis

Sleeping

App Files Files Community

Mpavan45 committed on Jan 24, 2025

Commit

2236f57

verified ·

1 Parent(s): f442fb7

Update pages/2_Data Cleaning and Processing .py

Browse files

Files changed (1) hide show

pages/2_Data Cleaning and Processing .py +130 -217

pages/2_Data Cleaning and Processing .py CHANGED Viewed

@@ -2,7 +2,6 @@ import streamlit as st
 import pandas as pd
 import os
 from io import StringIO
-import sys
 import re
 import numpy as np
@@ -13,26 +12,147 @@ st.markdown("<h1 style='text-align:center; color:#008080;'>Data Cleaning and Pro
 df = st.session_state.get("dataset")
 if df is not None:
     st.subheader("Dataset Preview:")
-    st.write(df.head())
     st.subheader("Info of the Dataset:")
     # Redirect the output of df.info() to a string buffer
     buffer = StringIO()
     df.info(buf=buffer)
-    # Display the content in Streamlit
-    st.write(buffer.getvalue())
-    st.subheader("Dataset Description:")
-    st.write(df.describe())
-    st.subheader("Shape of the Dataset:")
     st.write(df.shape)
 else:
     st.warning("No dataset found. Please upload a dataset on the Home page.")
 # Define the URL of the background image (use your own image URL)
 # Apply custom CSS for the background image and overlay
 background_image_url = "https://cdn-uploads.huggingface.co/production/uploads/675fab3a2d0851e23d23cad3/MI0hTKaf1a2EmxUfA6TsV.png"
@@ -74,35 +194,7 @@ st.markdown(
     """,
     unsafe_allow_html=True
 )
-st.markdown(
-    """
-    <style>
-    .custom-button {
-        display: inline-block;
-        padding: 5px 10px;
-        font-size: 14px;
-        color: #ffffff;
-        background-color: #4CAF50;
-        border: none;
-        border-radius: 5px;
-        text-align: center;
-        text-decoration: none;
-        transition: background-color 0.3s ease, transform 0.2s ease;
-        cursor: pointer;
-    }
-    .custom-button:hover {
-        background-color: #45a049;
-        transform: scale(1.05);
-    }
-    .button-container {
-        display: flex;
-        justify-content: space-between;
-        margin-top: 20px;
-    }
-    </style>
-    """,
-    unsafe_allow_html=True,
-)
 # Navigation Buttons
 st.markdown(
     """
@@ -113,182 +205,3 @@ st.markdown(
     """,
     unsafe_allow_html=True,
 )
-st.write("""
-### 1. **Title and File Upload**
-This section sets the title of the application and includes a file uploader that allows the user to upload a CSV file for cleaning. The file should be of CSV type.
-""")
-st.title("Hotel Booking Data Cleaning and Analysis")
-uploaded_file = st.file_uploader("Upload your CSV file for cleaning", type=["csv"])
-st.write("""
-### 2. **Check if File is Uploaded**
-If a file is uploaded, it is read into a pandas DataFrame for further processing and cleaning.
-""")
-if uploaded_file is not None:
-    df = pd.read_csv(uploaded_file)
-st.write("""
-### 3. **Cleaning the 'Hotel Name' Column**
-This section cleans the 'Hotel Name' column by:
-- Removing missing values.
-- Removing unwanted text such as newline characters, "View on map", and years.
-- Dropping rows with irrelevant hotel names (like "2021", "2022", "2023").
-""")
-df['Hotel Name'].isna().sum()
-df.dropna(subset=['Hotel Name'], inplace=True)
-df['Hotel Name'] = df['Hotel Name'].str.replace('\n', ',')
-df['Hotel Name'] = df['Hotel Name'].str.replace('-View on map', '')
-df['Hotel Name'] = df['Hotel Name'].str.replace('View on map', '')
-df['Hotel Name'] = df['Hotel Name'].str.replace('-', '')
-df.drop(index=df[df['Hotel Name'].isin(['2021', '2022', '2023'])].index, inplace=True)
-st.write("""
-### 4. **Cleaning the 'Rating' Column**
-This part deals with cleaning the 'Rating' column:
-- Removing missing values.
-- Extracting numerical values from string ratings using regular expressions.
-""")
-df['Rating'].isna().sum()
-df.dropna(subset=['Rating'], inplace=True)
-def extract_rating(rating_str):
-    if isinstance(rating_str, str):
-        match = re.search(r'(\d+)', rating_str)
-        if match:
-            return float(match.group(1))
-    return np.nan
-df['Rating'] = df['Rating'].apply(extract_rating)
-st.write("""
-### 5. **Cleaning the 'Location' Column**
-- Missing values in the 'Location' column are dropped.
-- The location is cleaned by extracting only words and formatting them consistently.
-""")
-df['Location'].isna().sum()
-df.dropna(subset=['Location'], inplace=True)
-df['Location'] = df['Location'].apply(lambda x: re.findall(r'\w+', x))
-df['Location'] = df['Location'].apply(lambda x: ' '.join(x))
-df['Location'] = df['Location'].apply(lambda x: re.sub(r"\d+", r",\g<0>", x))
-st.write("""
-### 6. **Cleaning the 'Discount' Column**
-This section:
-- Extracts discount values from strings.
-- Replaces missing or invalid discounts with '0'.
-- Adjusts discounts greater than 50 by reducing them by 50.
-""")
-def f(x):
-    return re.findall(r"\d{2}", str(x))
-df.Discount = df.Discount.apply(f).str[0]
-df['Discount'] = df.apply(lambda row: '0' if (str(row['Discount']) in ['Nan', 'np.nan', 'nan', ''] or pd.isnull(row['Discount'])) else row['Discount'], axis=1)
-df.Discount = df.Discount.apply(lambda x: int(x))
-df.Discount = df.Discount.apply(lambda x: x - 50 if x > 50 else x)
-st.write("""
-### 7. **Cleaning the 'Review Text' Column**
-- The 'Review Text' is truncated to only include the first sentence.
-- Ratings are mapped to corresponding descriptive review text.
-- Invalid reviews are replaced based on conditions like missing or incomplete data.
-""")
-df['Review Text'].isna().sum()
-df['Review Text'] = df['Review Text'].str.split('.').str[0]
-df['Review Text'].replace('10', 'Exceptional', inplace=True)
-df['Review Text'].replace('9', 'Excellent', inplace=True)
-df['Review Text'].replace('8', 'Very Good', inplace=True)
-df['Review Text'].replace('7', 'Good', inplace=True)
-df['Review Text'].replace(['2', '4', '5', '6'], 'Bad', inplace=True)
-df['Review Text'] = df.apply(lambda row: 'Exceptional' if (row['Rating'] == 5) and (row['Review Text'] in ['Nan', 'np', 'km', 'stars', 'Review'] or pd.isnull(row['Review Text'])) else row['Review Text'], axis=1)
-st.write("""
-### 8. **Cleaning the 'Reviews' Column**
-This part ensures that:
-- Missing or invalid review values in the 'Reviews' column are replaced with '0'.
-""")
-df['Reviews'] = df.apply(lambda row: '0' if (str(row['Reviews']) in ['nan', 'np', 'np.nan']) else row['Reviews'], axis=1)
-st.write("""
-### 9. **Cleaning the 'Cashback' Column**
-- Missing or invalid cashback values are replaced with '0' for consistency.
-""")
-df['Cashback'] = df.apply(lambda row: '0' if (str(row['Cashback']) in ['Nan', 'np.nan', 'nan', ''] or pd.isnull(row['Cashback'])) else row['Cashback'], axis=1)
-st.write("""
-### 10. **Cleaning the 'Cancellation' Column**
-- Unwanted values (like '#NAME?') in the 'Cancellation' column are replaced with 'No'.
-""")
-df['Cancellation'] = df['Cancellation'].replace('#NAME?', 'No')
-st.write("""
-### 11. **Cleaning the 'Price' Column**
-- Commas in the 'Price' column are removed.
-- Values are converted to numeric types, with errors coerced into NaN.
-- Rows with missing or invalid prices are removed.
-""")
-df['Price'] = df['Price'].str.replace(',', '')
-df['Price'] = pd.to_numeric(df['Price'], errors='coerce')
-df = df.dropna(subset=['Price'])
-st.write("""
-### 12. **Adding Free Services Based on Rating**
-- A dictionary `hotel_ammenities_by_star` is used to define the free services based on the hotel’s rating.
-- These services are added to the DataFrame as a new column 'Free Services'.
-""")
-hotel_ammenities_by_star = {
-    1.0: ["Wi-Fi", "Complimentary parking", "Basic toiletries"],
-    2.0: ["Wi-Fi", "Complimentary parking", "Basic toiletries", "Laundry facilities", "Local calls"],
-    3.0: ["Wi-Fi", "Complimentary parking", "Basic toiletries", "Fitness center access", "Hair dryer"],
-    4.0: ["Wi-Fi", "Complimentary parking", "Basic toiletries", "Welcome drink", "Turndown service", "Minibar (select items)", "Complimentary newspapers", "Shoe shine service", "In-room safe"],
-    5.0: ["Wi-Fi", "Complimentary parking", "Basic toiletries", "Spa facilities (sauna, steam room)", "Complimentary upgrade (subject to availability)", "Personal shopping assistant", "In-room minibar (select items)", "Kids' club and childcare services", "Transportation within city limits", "Unlimited local calls and faxes"]
-}
-df['Free Services'] = df['Rating'].apply(lambda rating: hotel_ammenities_by_star.get(rating, []))
-st.write("""
-### 13. **Store Cleaned Data in Session State**
-The cleaned DataFrame is stored in Streamlit’s session state, allowing it to persist across pages or interactions.
-""")
-st.session_state.cleaned_data = df
-st.write("""
-### 14. **Displaying Cleaned Data**
-The cleaned dataset is displayed as a preview by showing the first few rows of the DataFrame.
-""")
-st.write("### Cleaned Data Preview")
-st.dataframe(df.head())
-st.write("""
-### 15. **Download Button for CSV**
-A button is provided for the user to download the cleaned data as a CSV file, enabling easy access to the results.
-""")
-st.write("### Download the Cleaned Data")
-st.download_button(
-    label="Download CSV",
-    data=df.to_csv(index=False),
-    file_name="Cleaned_Agoda_Data.csv",
-    mime="text/csv"
-)
-st.write("""
-### 16. **Dataset Information**
-Displays basic information about the DataFrame such as column data types, non-null counts, and memory usage.
-""")
-st.write("### Dataset Information")
-st.text(df.info())

 import pandas as pd
 import os
 from io import StringIO
 import re
 import numpy as np
 df = st.session_state.get("dataset")
 if df is not None:
+    # Dataset Preview
     st.subheader("Dataset Preview:")
+    st.write(df.head())  # Display the first 5 rows
     st.subheader("Info of the Dataset:")
     # Redirect the output of df.info() to a string buffer
     buffer = StringIO()
     df.info(buf=buffer)
+    # Show the captured df.info() report verbatim. st.text keeps the report's
+    # column alignment and, unlike wrapping it in an inline ``` fence, does not
+    # turn the report's first line into the fence's language tag (which hides it).
+    # The "Info of the Dataset:" subheader was already written before the buffer
+    # was filled, so it is not repeated here.
+    st.text(buffer.getvalue())
+    st.subheader("Dataset Shape (Rows, Columns):")
     st.write(df.shape)
+    # Cleaning the 'Hotel Name' column
+    st.subheader("Cleaning the 'Hotel Name' Column:")
+    # Work on a private copy. `df` is the very object held in
+    # st.session_state["dataset"], and Streamlit re-runs this script on every
+    # interaction (a click on the download button included). Cleaning the stored
+    # frame in place would make the next run re-clean already-cleaned columns.
+    df = df.copy()
+    # Newlines become commas; the "View on map" text (with or without its
+    # leading dash) and any remaining dashes are removed. All patterns are
+    # literal text, so regex matching is switched off explicitly.
+    hotel_names = df['Hotel Name'].str.replace('\n', ',', regex=False)
+    for unwanted in ('-View on map', 'View on map', '-'):
+        hotel_names = hotel_names.str.replace(unwanted, '', regex=False)
+    df['Hotel Name'] = hotel_names
+    # Discard rows whose name is nothing but a year.
+    df.drop(index=df[df['Hotel Name'].isin(['2021', '2022', '2023'])].index, inplace=True)
+    st.write("Cleaned 'Hotel Name' Column:")
+    st.write(df[['Hotel Name']].head())
+    # Cleaning the 'Rating' column
+    st.subheader("Cleaning the 'Rating' Column:")
+    def extract_rating(rating_str):
+        """Return the rating as a float, or NaN when none can be determined.
+
+        Strings are searched for their first run of digits ("4 stars" -> 4.0).
+        Values that are already numeric are converted to float instead of being
+        discarded, so a numeric column is not wiped out. Anything else,
+        including missing values, yields NaN.
+        """
+        if isinstance(rating_str, str):
+            match = re.search(r'(\d+)', rating_str)
+            return float(match.group(1)) if match else np.nan
+        # bool is a subclass of int but is never a rating.
+        if isinstance(rating_str, (int, float, np.integer, np.floating)) and not isinstance(rating_str, bool):
+            return float(rating_str)
+        return np.nan
+    df['Rating'] = df['Rating'].apply(extract_rating)
+    st.write("Cleaned 'Rating' Column:")
+    st.write(df[['Rating']].head())
+    # Cleaning the 'Location' column
+    st.subheader("Cleaning the 'Location' Column:")
+    # re.findall raises TypeError on a missing (NaN) value, so rows without a
+    # location are dropped before tokenising. The copy keeps the later column
+    # assignments from targeting a derived frame.
+    df = df.dropna(subset=['Location']).copy()
+    def tidy_location(raw):
+        # Keep only the word tokens, joined by single spaces, then put a comma
+        # in front of every run of digits.
+        words = ' '.join(re.findall(r'\w+', raw))
+        return re.sub(r"\d+", r",\g<0>", words)
+    df['Location'] = df['Location'].apply(tidy_location)
+    st.write("Cleaned 'Location' Column:")
+    st.write(df[['Location']].head())
+    # Cleaning the 'Discount' column
+    st.subheader("Cleaning the 'Discount' Column:")
+    def f(x):
+        # Return all non-overlapping pairs of consecutive digits found in str(x),
+        # as a list of two-character strings (empty list when there are none).
+        return re.findall(r"\d{2}", str(x))
+    # Keep the first two-digit match of each row; rows without a match become missing.
+    df.Discount = df.Discount.apply(f).str[0]
+    # Missing values (and textual NaN spellings) become the string '0' so that
+    # the int() conversion below succeeds for every row.
+    df['Discount'] = df.apply(lambda row: '0' if (str(row['Discount']) in ['Nan', 'np.nan', 'nan', ''] or pd.isnull(row['Discount'])) else row['Discount'], axis=1)
+    df.Discount = df.Discount.apply(lambda x: int(x))
+    # Values above 50 are lowered by 50.
+    # NOTE(review): the reason for this adjustment is not visible here; confirm against the data.
+    df.Discount = df.Discount.apply(lambda x: x - 50 if x > 50 else x)
+    st.write("Cleaned 'Discount' Column:")
+    st.write(df[['Discount']].head())
+    # Cleaning the 'Review Text' column
+    st.subheader("Cleaning the 'Review Text' Column:")
+    # Keep only the text before the first full stop.
+    review_text = df['Review Text'].str.split('.').str[0]
+    # Rows that hold a bare score instead of a label are translated to a label.
+    # The result is assigned back explicitly: calling
+    # df[col].replace(..., inplace=True) modifies a temporary under pandas
+    # Copy-on-Write and leaves df unchanged.
+    score_labels = {
+        '10': 'Exceptional',
+        '9': 'Excellent',
+        '8': 'Very Good',
+        '7': 'Good',
+        '2': 'Bad',
+        '4': 'Bad',
+        '5': 'Bad',
+        '6': 'Bad',
+    }
+    review_text = review_text.replace(score_labels)
+    # Rows rated 5 whose text is missing or one of the leftover fragments
+    # below are labelled 'Exceptional'; every other row keeps its text.
+    leftover = review_text.isin(['Nan', 'np', 'km', 'stars', 'Review']) | review_text.isna()
+    df['Review Text'] = review_text.mask((df['Rating'] == 5) & leftover, 'Exceptional')
+    st.write("Cleaned 'Review Text' Column:")
+    st.write(df[['Review Text']].head())
+    # Cleaning the 'Reviews' column
+    st.subheader("Cleaning the 'Reviews' Column:")
+    # Entries whose text form is one of the NaN spellings become the string '0';
+    # all other entries are kept as they are.
+    nan_spellings = ['nan', 'np', 'np.nan']
+    df['Reviews'] = ['0' if str(entry) in nan_spellings else entry for entry in df['Reviews']]
+    st.write("Cleaned 'Reviews' Column:")
+    st.write(df[['Reviews']].head())
+    # Cleaning the 'Cashback' column
+    st.subheader("Cleaning the 'Cashback' Column:")
+    # Missing entries, and entries whose text form is empty or a NaN spelling,
+    # become the string '0'; all other entries are kept as they are.
+    blank_spellings = ['Nan', 'np.nan', 'nan', '']
+    df['Cashback'] = [
+        '0' if (str(entry) in blank_spellings or pd.isnull(entry)) else entry
+        for entry in df['Cashback']
+    ]
+    st.write("Cleaned 'Cashback' Column:")
+    st.write(df[['Cashback']].head())
+    # Cleaning the 'Cancellation' column
+    st.subheader("Cleaning the 'Cancellation' Column:")
+    # The literal '#NAME?' is mapped to 'No'; every other value is left alone.
+    cancellation_fixes = {'#NAME?': 'No'}
+    df['Cancellation'] = df['Cancellation'].replace(cancellation_fixes)
+    st.write("Cleaned 'Cancellation' Column:")
+    st.write(df[['Cancellation']].head())
+    # Cleaning the 'Price' column
+    st.subheader("Cleaning the 'Price' Column:")
+    # Convert to text first: the .str accessor raises AttributeError when the
+    # column is already numeric (for example when no price contained a comma).
+    price_text = df['Price'].astype(str).str.replace(',', '', regex=False)
+    # Anything that still is not a number becomes NaN, and those rows are dropped.
+    df['Price'] = pd.to_numeric(price_text, errors='coerce')
+    df = df.dropna(subset=['Price'])
+    st.write("Cleaned 'Price' Column:")
+    st.write(df[['Price']].head())
+    # Adding Free Services based on Rating
+    st.subheader("Adding Free Services Based on Rating:")
+    # Lookup table: rating value -> list of complimentary services.
+    hotel_ammenities_by_star = {
+        1.0: ["Wi-Fi", "Complimentary parking", "Basic toiletries"],
+        2.0: ["Wi-Fi", "Complimentary parking", "Basic toiletries", "Laundry facilities", "Local calls"],
+        3.0: ["Wi-Fi", "Complimentary parking", "Basic toiletries", "Fitness center access", "Hair dryer"],
+        4.0: ["Wi-Fi", "Complimentary parking", "Basic toiletries", "Welcome drink", "Turndown service", "Minibar (select items)", "Complimentary newspapers", "Shoe shine service", "In-room safe"],
+        5.0: ["Wi-Fi", "Complimentary parking", "Basic toiletries", "Spa facilities (sauna, steam room)", "Complimentary upgrade (subject to availability)", "Personal shopping assistant", "In-room minibar (select items)", "Kids' club and childcare services", "Transportation within city limits", "Unlimited local calls and faxes"]
+    }
+    # Ratings that are not a key of the table (NaN included) get an empty list.
+    def services_for(stars):
+        return hotel_ammenities_by_star.get(stars, [])
+    df['Free Services'] = df['Rating'].apply(services_for)
+    st.write("Added Free Services Based on Rating:")
+    st.write(df[['Free Services']].head())
+    # Keep the cleaned frame in session state under `cleaned_data`
+    st.session_state.cleaned_data = df
+    # Preview of the cleaned data (first rows only)
+    st.subheader("Cleaned Data Preview:")
+    st.dataframe(df.head())
+    # Serialise the cleaned data to CSV text in memory (nothing is written to
+    # disk) and offer it to the user as a download
+    st.subheader("Download the Cleaned Data")
+    csv_payload = df.to_csv(index=False)
+    st.download_button(
+        label="Download CSV",
+        data=csv_payload,
+        file_name="Cleaned_Hotel_Data.csv",
+        mime="text/csv",
+    )
 else:
     st.warning("No dataset found. Please upload a dataset on the Home page.")
 # Define the URL of the background image (use your own image URL)
 # Apply custom CSS for the background image and overlay
 background_image_url = "https://cdn-uploads.huggingface.co/production/uploads/675fab3a2d0851e23d23cad3/MI0hTKaf1a2EmxUfA6TsV.png"
     """,
     unsafe_allow_html=True
 )
 # Navigation Buttons
 st.markdown(
     """
     """,
     unsafe_allow_html=True,
 )