Mpavan45 committed on
Commit
08421d4
·
verified ·
1 Parent(s): d8e0bfd

Update pages/2_Data Cleaning and Processing .py

Browse files
pages/2_Data Cleaning and Processing .py CHANGED
@@ -3,6 +3,8 @@ import pandas as pd
3
  import os
4
  from io import StringIO
5
  import sys
 
 
6
 
7
  # Page Title
8
  st.markdown("<h1 style='text-align:center; color:#008080;'>Data Cleaning and Processing</h1>", unsafe_allow_html=True)
@@ -39,7 +41,7 @@ st.markdown(
39
  <style>
40
  .stApp {{
41
  background-image: url("{background_image_url}");
42
- background-size: 100% auto; /* Ensure the image width is 100% of the screen, and the height scales proportionally */
43
  background-repeat: repeat-y; /* Repeat only vertically */
44
  background-position: top center; /* Start repeating from the top center */
45
  background-attachment: fixed; /* Keeps the background fixed as you scroll */
@@ -101,12 +103,6 @@ st.markdown(
101
  """,
102
  unsafe_allow_html=True,
103
  )
104
- import streamlit as st
105
-
106
- st.title("Data Cleaning and Processing")
107
-
108
- st.write("Perform data cleaning and preprocessing tasks here.")
109
-
110
  # Navigation Buttons
111
  st.markdown(
112
  """
@@ -117,3 +113,182 @@ st.markdown(
117
  """,
118
  unsafe_allow_html=True,
119
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  import os
4
  from io import StringIO
5
  import sys
6
+ import re
7
+ import numpy as np
8
 
9
  # Page Title
10
  st.markdown("<h1 style='text-align:center; color:#008080;'>Data Cleaning and Processing</h1>", unsafe_allow_html=True)
 
41
  <style>
42
  .stApp {{
43
  background-image: url("{background_image_url}");
44
+ background-size: auto; /* Ensure the image width is 100% of the screen, and the height scales proportionally */
45
  background-repeat: repeat-y; /* Repeat only vertically */
46
  background-position: top center; /* Start repeating from the top center */
47
  background-attachment: fixed; /* Keeps the background fixed as you scroll */
 
103
  """,
104
  unsafe_allow_html=True,
105
  )
 
 
 
 
 
 
106
  # Navigation Buttons
107
  st.markdown(
108
  """
 
113
  """,
114
  unsafe_allow_html=True,
115
  )
116
+ st.write("""
117
+ ### 1. **Title and File Upload**
118
+ This section sets the title of the application and includes a file uploader that allows the user to upload a CSV file for cleaning. The file should be of CSV type.
119
+ """)
120
+
121
+ st.title("Hotel Booking Data Cleaning and Analysis")
122
+ uploaded_file = st.file_uploader("Upload your CSV file for cleaning", type=["csv"])
123
+
124
+ st.write("""
125
+ ### 2. **Check if File is Uploaded**
126
+ If a file is uploaded, it is read into a pandas DataFrame for further processing and cleaning.
127
+ """)
128
+
129
+ if uploaded_file is not None:
130
+ df = pd.read_csv(uploaded_file)
131
+
132
+ st.write("""
133
+ ### 3. **Cleaning the 'Hotel Name' Column**
134
+ This section cleans the 'Hotel Name' column by:
135
+ - Removing missing values.
136
+ - Removing unwanted text such as newline characters, "View on map", and years.
137
+ - Dropping rows with irrelevant hotel names (like "2021", "2022", "2023").
138
+ """)
139
+
140
# Rows without a hotel name cannot be cleaned or analysed, so drop them first.
df.dropna(subset=['Hotel Name'], inplace=True)

# Strip scraping residue from the names. regex=False makes every pattern a
# literal string, so the result does not depend on the pandas version's
# default for the `regex` argument.
hotel_names = df['Hotel Name']
for unwanted, replacement in (
    ('\n', ','),
    ('-View on map', ''),
    ('View on map', ''),
    ('-', ''),
):
    hotel_names = hotel_names.str.replace(unwanted, replacement, regex=False)
df['Hotel Name'] = hotel_names

# Rows whose "name" is just a year are not hotels; remove them.
df.drop(index=df[df['Hotel Name'].isin(['2021', '2022', '2023'])].index, inplace=True)
147
+
148
+ st.write("""
149
+ ### 4. **Cleaning the 'Rating' Column**
150
+ This part deals with cleaning the 'Rating' column:
151
+ - Removing missing values.
152
+ - Extracting numerical values from string ratings using regular expressions.
153
+ """)
154
+
155
+ df['Rating'].isna().sum()
156
+ df.dropna(subset=['Rating'], inplace=True)
157
+
158
def extract_rating(rating_str):
    """Convert a raw cell of the 'Rating' column into a float.

    Args:
        rating_str: The cell value. Despite the name it may be text or an
            already-numeric value, depending on how the CSV was parsed.

    Returns:
        For text, the first run of digits as a float (so "4.5 stars" gives
        4.0); for int/float values, the value itself as a float; ``np.nan``
        for anything else or for text without digits.
    """
    if isinstance(rating_str, str):
        match = re.search(r'\d+', rating_str)
        return float(match.group()) if match else np.nan
    # A column whose cells are all numeric arrives as numbers, not text;
    # pass those through instead of discarding them as NaN.
    is_number = isinstance(rating_str, (int, float, np.integer, np.floating))
    if is_number and not isinstance(rating_str, bool):
        return float(rating_str)
    return np.nan
164
+
165
+ df['Rating'] = df['Rating'].apply(extract_rating)
166
+
167
+ st.write("""
168
+ ### 5. **Cleaning the 'Location' Column**
169
+ - Missing values in the 'Location' column are dropped.
170
+ - The location is cleaned by extracting only words and formatting them consistently.
171
+ """)
172
+
173
# Rows without a location are dropped before any text processing.
df.dropna(subset=['Location'], inplace=True)

# Keep only the word characters, re-join them with single spaces, then put a
# comma in front of every run of digits.
df['Location'] = df['Location'].apply(
    lambda raw: re.sub(r"\d+", r",\g<0>", ' '.join(re.findall(r'\w+', raw)))
)
178
+
179
+ st.write("""
180
+ ### 6. **Cleaning the 'Discount' Column**
181
+ This section:
182
+ - Extracts discount values from strings.
183
+ - Replaces missing or invalid discounts with '0'.
184
+ - Adjusts discounts greater than 50 by reducing them by 50.
185
+ """)
186
+
187
def f(x):
    """Return every non-overlapping two-digit run found in ``str(x)``."""
    return re.findall(r"\d{2}", str(x))

# Take the first two-digit match per row; rows with no match yield NaN and
# are treated as a discount of 0.
df['Discount'] = df['Discount'].apply(f).str[0].fillna('0').astype(int)

# Values above 50 are shifted down by 50; everything else is kept as is.
df['Discount'] = df['Discount'].where(df['Discount'] <= 50, df['Discount'] - 50)
194
+
195
+ st.write("""
196
+ ### 7. **Cleaning the 'Review Text' Column**
197
+ - The 'Review Text' is truncated to only include the first sentence.
198
+ - Ratings are mapped to corresponding descriptive review text.
199
+ - Invalid reviews are replaced based on conditions like missing or incomplete data.
200
+ """)
201
+
202
# Keep only the text before the first full stop.
df['Review Text'] = df['Review Text'].str.split('.').str[0]

# Cells holding a bare numeric score are mapped to a descriptive label.
# The result is assigned back to the column: calling replace(inplace=True)
# on a column selection is a chained in-place update, which does not modify
# the DataFrame when pandas copy-on-write is active.
score_labels = {
    '10': 'Exceptional',
    '9': 'Excellent',
    '8': 'Very Good',
    '7': 'Good',
    '2': 'Bad',
    '4': 'Bad',
    '5': 'Bad',
    '6': 'Bad',
}
df['Review Text'] = df['Review Text'].replace(score_labels)

# For rating 5 rows whose review text is missing or a scraping placeholder,
# use 'Exceptional' as the label.
is_placeholder = (
    df['Review Text'].isin(['Nan', 'np', 'km', 'stars', 'Review'])
    | df['Review Text'].isna()
)
df.loc[(df['Rating'] == 5) & is_placeholder, 'Review Text'] = 'Exceptional'
210
+
211
+ st.write("""
212
+ ### 8. **Cleaning the 'Reviews' Column**
213
+ This part ensures that:
214
+ - Missing or invalid review values in the 'Reviews' column are replaced with '0'.
215
+ """)
216
+
217
# Replace missing-value markers in 'Reviews' with '0'. Only this column is
# needed, so apply over the column rather than building a Series per row.
df['Reviews'] = df['Reviews'].apply(
    lambda value: '0' if str(value) in ('nan', 'np', 'np.nan') else value
)
218
+
219
+ st.write("""
220
+ ### 9. **Cleaning the 'Cashback' Column**
221
+ - Missing or invalid cashback values are replaced with '0' for consistency.
222
+ """)
223
+
224
# Replace missing or empty cashback values with '0'. Only this column is
# needed, so apply over the column rather than building a Series per row.
df['Cashback'] = df['Cashback'].apply(
    lambda value: '0'
    if pd.isnull(value) or str(value) in ('Nan', 'np.nan', 'nan', '')
    else value
)
225
+
226
+ st.write("""
227
+ ### 10. **Cleaning the 'Cancellation' Column**
228
+ - Unwanted values (like '#NAME?') in the 'Cancellation' column are replaced with 'No'.
229
+ """)
230
+
231
+ df['Cancellation'] = df['Cancellation'].replace('#NAME?', 'No')
232
+
233
+ st.write("""
234
+ ### 11. **Cleaning the 'Price' Column**
235
+ - Commas in the 'Price' column are removed.
236
+ - Values are converted to numeric types, with errors coerced into NaN.
237
+ - Rows with missing or invalid prices are removed.
238
+ """)
239
+
240
# Remove thousands separators before converting. The cast to str keeps this
# working when pandas has already parsed 'Price' as a numeric column, where
# the .str accessor would raise AttributeError.
price_text = df['Price'].astype(str).str.replace(',', '', regex=False)

# Anything that still is not a number becomes NaN and is dropped.
df['Price'] = pd.to_numeric(price_text, errors='coerce')
df = df.dropna(subset=['Price'])
243
+
244
+ st.write("""
245
+ ### 12. **Adding Free Services Based on Rating**
246
+ - A dictionary `hotel_ammenities_by_star` is used to define the free services based on the hotel’s rating.
247
+ - These services are added to the DataFrame as a new column 'Free Services'.
248
+ """)
249
+
250
+ hotel_ammenities_by_star = {
251
+ 1.0: ["Wi-Fi", "Complimentary parking", "Basic toiletries"],
252
+ 2.0: ["Wi-Fi", "Complimentary parking", "Basic toiletries", "Laundry facilities", "Local calls"],
253
+ 3.0: ["Wi-Fi", "Complimentary parking", "Basic toiletries", "Fitness center access", "Hair dryer"],
254
+ 4.0: ["Wi-Fi", "Complimentary parking", "Basic toiletries", "Welcome drink", "Turndown service", "Minibar (select items)", "Complimentary newspapers", "Shoe shine service", "In-room safe"],
255
+ 5.0: ["Wi-Fi", "Complimentary parking", "Basic toiletries", "Spa facilities (sauna, steam room)", "Complimentary upgrade (subject to availability)", "Personal shopping assistant", "In-room minibar (select items)", "Kids' club and childcare services", "Transportation within city limits", "Unlimited local calls and faxes"]
256
+ }
257
+
258
+ df['Free Services'] = df['Rating'].apply(lambda rating: hotel_ammenities_by_star.get(rating, []))
259
+
260
+ st.write("""
261
+ ### 13. **Store Cleaned Data in Session State**
262
+ The cleaned DataFrame is stored in Streamlit’s session state, allowing it to persist across pages or interactions.
263
+ """)
264
+
265
+ st.session_state.cleaned_data = df
266
+
267
+ st.write("""
268
+ ### 14. **Displaying Cleaned Data**
269
+ The cleaned dataset is displayed as a preview by showing the first few rows of the DataFrame.
270
+ """)
271
+
272
+ st.write("### Cleaned Data Preview")
273
+ st.dataframe(df.head())
274
+
275
+ st.write("""
276
+ ### 15. **Download Button for CSV**
277
+ A button is provided for the user to download the cleaned data as a CSV file, enabling easy access to the results.
278
+ """)
279
+
280
+ st.write("### Download the Cleaned Data")
281
+ st.download_button(
282
+ label="Download CSV",
283
+ data=df.to_csv(index=False),
284
+ file_name="Cleaned_Agoda_Data.csv",
285
+ mime="text/csv"
286
+ )
287
+
288
+ st.write("""
289
+ ### 16. **Dataset Information**
290
+ Displays basic information about the DataFrame such as column data types, non-null counts, and memory usage.
291
+ """)
292
+
293
st.write("### Dataset Information")
# DataFrame.info() writes its report to a stream and returns None, so the
# report is captured in a buffer and the buffer's text is displayed.
info_buffer = StringIO()
df.info(buf=info_buffer)
st.text(info_buffer.getvalue())