Mpavan45 committed on
Commit
2236f57
·
verified ·
1 Parent(s): f442fb7

Update pages/2_Data Cleaning and Processing .py

Browse files
Files changed (1) hide show
  1. pages/2_Data Cleaning and Processing .py +130 -217
pages/2_Data Cleaning and Processing .py CHANGED
@@ -2,7 +2,6 @@ import streamlit as st
2
  import pandas as pd
3
  import os
4
  from io import StringIO
5
- import sys
6
  import re
7
  import numpy as np
8
 
@@ -13,26 +12,147 @@ st.markdown("<h1 style='text-align:center; color:#008080;'>Data Cleaning and Pro
13
  df = st.session_state.get("dataset")
14
 
15
  if df is not None:
 
 
16
  st.subheader("Dataset Preview:")
17
- st.write(df.head())
18
 
19
  st.subheader("Info of the Dataset:")
20
  # Redirect the output of df.info() to a string buffer
21
  buffer = StringIO()
22
  df.info(buf=buffer)
23
-
24
- # Display the content in Streamlit
25
- st.write(buffer.getvalue())
26
 
27
- st.subheader("Dataset Description:")
28
- st.write(df.describe())
 
29
 
30
- st.subheader("Shape of the Dataset:")
31
  st.write(df.shape)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  else:
33
  st.warning("No dataset found. Please upload a dataset on the Home page.")
34
 
35
-
36
  # Define the URL of the background image (use your own image URL)
37
  # Apply custom CSS for the background image and overlay
38
  background_image_url = "https://cdn-uploads.huggingface.co/production/uploads/675fab3a2d0851e23d23cad3/MI0hTKaf1a2EmxUfA6TsV.png"
@@ -74,35 +194,7 @@ st.markdown(
74
  """,
75
  unsafe_allow_html=True
76
  )
77
- st.markdown(
78
- """
79
- <style>
80
- .custom-button {
81
- display: inline-block;
82
- padding: 5px 10px;
83
- font-size: 14px;
84
- color: #ffffff;
85
- background-color: #4CAF50;
86
- border: none;
87
- border-radius: 5px;
88
- text-align: center;
89
- text-decoration: none;
90
- transition: background-color 0.3s ease, transform 0.2s ease;
91
- cursor: pointer;
92
- }
93
- .custom-button:hover {
94
- background-color: #45a049;
95
- transform: scale(1.05);
96
- }
97
- .button-container {
98
- display: flex;
99
- justify-content: space-between;
100
- margin-top: 20px;
101
- }
102
- </style>
103
- """,
104
- unsafe_allow_html=True,
105
- )
106
  # Navigation Buttons
107
  st.markdown(
108
  """
@@ -113,182 +205,3 @@ st.markdown(
113
  """,
114
  unsafe_allow_html=True,
115
  )
116
- st.write("""
117
- ### 1. **Title and File Upload**
118
- This section sets the title of the application and includes a file uploader that allows the user to upload a CSV file for cleaning. The file should be of CSV type.
119
- """)
120
-
121
- st.title("Hotel Booking Data Cleaning and Analysis")
122
- uploaded_file = st.file_uploader("Upload your CSV file for cleaning", type=["csv"])
123
-
124
- st.write("""
125
- ### 2. **Check if File is Uploaded**
126
- If a file is uploaded, it is read into a pandas DataFrame for further processing and cleaning.
127
- """)
128
-
129
- if uploaded_file is not None:
130
- df = pd.read_csv(uploaded_file)
131
-
132
- st.write("""
133
- ### 3. **Cleaning the 'Hotel Name' Column**
134
- This section cleans the 'Hotel Name' column by:
135
- - Removing missing values.
136
- - Removing unwanted text such as newline characters, "View on map", and years.
137
- - Dropping rows with irrelevant hotel names (like "2021", "2022", "2023").
138
- """)
139
-
140
- df['Hotel Name'].isna().sum()
141
- df.dropna(subset=['Hotel Name'], inplace=True)
142
- df['Hotel Name'] = df['Hotel Name'].str.replace('\n', ',')
143
- df['Hotel Name'] = df['Hotel Name'].str.replace('-View on map', '')
144
- df['Hotel Name'] = df['Hotel Name'].str.replace('View on map', '')
145
- df['Hotel Name'] = df['Hotel Name'].str.replace('-', '')
146
- df.drop(index=df[df['Hotel Name'].isin(['2021', '2022', '2023'])].index, inplace=True)
147
-
148
- st.write("""
149
- ### 4. **Cleaning the 'Rating' Column**
150
- This part deals with cleaning the 'Rating' column:
151
- - Removing missing values.
152
- - Extracting numerical values from string ratings using regular expressions.
153
- """)
154
-
155
- df['Rating'].isna().sum()
156
- df.dropna(subset=['Rating'], inplace=True)
157
-
158
- def extract_rating(rating_str):
159
- if isinstance(rating_str, str):
160
- match = re.search(r'(\d+)', rating_str)
161
- if match:
162
- return float(match.group(1))
163
- return np.nan
164
-
165
- df['Rating'] = df['Rating'].apply(extract_rating)
166
-
167
- st.write("""
168
- ### 5. **Cleaning the 'Location' Column**
169
- - Missing values in the 'Location' column are dropped.
170
- - The location is cleaned by extracting only words and formatting them consistently.
171
- """)
172
-
173
- df['Location'].isna().sum()
174
- df.dropna(subset=['Location'], inplace=True)
175
- df['Location'] = df['Location'].apply(lambda x: re.findall(r'\w+', x))
176
- df['Location'] = df['Location'].apply(lambda x: ' '.join(x))
177
- df['Location'] = df['Location'].apply(lambda x: re.sub(r"\d+", r",\g<0>", x))
178
-
179
- st.write("""
180
- ### 6. **Cleaning the 'Discount' Column**
181
- This section:
182
- - Extracts discount values from strings.
183
- - Replaces missing or invalid discounts with '0'.
184
- - Adjusts discounts greater than 50 by reducing them by 50.
185
- """)
186
-
187
- def f(x):
188
- return re.findall(r"\d{2}", str(x))
189
-
190
- df.Discount = df.Discount.apply(f).str[0]
191
- df['Discount'] = df.apply(lambda row: '0' if (str(row['Discount']) in ['Nan', 'np.nan', 'nan', ''] or pd.isnull(row['Discount'])) else row['Discount'], axis=1)
192
- df.Discount = df.Discount.apply(lambda x: int(x))
193
- df.Discount = df.Discount.apply(lambda x: x - 50 if x > 50 else x)
194
-
195
- st.write("""
196
- ### 7. **Cleaning the 'Review Text' Column**
197
- - The 'Review Text' is truncated to only include the first sentence.
198
- - Ratings are mapped to corresponding descriptive review text.
199
- - Invalid reviews are replaced based on conditions like missing or incomplete data.
200
- """)
201
-
202
- df['Review Text'].isna().sum()
203
- df['Review Text'] = df['Review Text'].str.split('.').str[0]
204
- df['Review Text'].replace('10', 'Exceptional', inplace=True)
205
- df['Review Text'].replace('9', 'Excellent', inplace=True)
206
- df['Review Text'].replace('8', 'Very Good', inplace=True)
207
- df['Review Text'].replace('7', 'Good', inplace=True)
208
- df['Review Text'].replace(['2', '4', '5', '6'], 'Bad', inplace=True)
209
- df['Review Text'] = df.apply(lambda row: 'Exceptional' if (row['Rating'] == 5) and (row['Review Text'] in ['Nan', 'np', 'km', 'stars', 'Review'] or pd.isnull(row['Review Text'])) else row['Review Text'], axis=1)
210
-
211
- st.write("""
212
- ### 8. **Cleaning the 'Reviews' Column**
213
- This part ensures that:
214
- - Missing or invalid review values in the 'Reviews' column are replaced with '0'.
215
- """)
216
-
217
- df['Reviews'] = df.apply(lambda row: '0' if (str(row['Reviews']) in ['nan', 'np', 'np.nan']) else row['Reviews'], axis=1)
218
-
219
- st.write("""
220
- ### 9. **Cleaning the 'Cashback' Column**
221
- - Missing or invalid cashback values are replaced with '0' for consistency.
222
- """)
223
-
224
- df['Cashback'] = df.apply(lambda row: '0' if (str(row['Cashback']) in ['Nan', 'np.nan', 'nan', ''] or pd.isnull(row['Cashback'])) else row['Cashback'], axis=1)
225
-
226
- st.write("""
227
- ### 10. **Cleaning the 'Cancellation' Column**
228
- - Unwanted values (like '#NAME?') in the 'Cancellation' column are replaced with 'No'.
229
- """)
230
-
231
- df['Cancellation'] = df['Cancellation'].replace('#NAME?', 'No')
232
-
233
- st.write("""
234
- ### 11. **Cleaning the 'Price' Column**
235
- - Commas in the 'Price' column are removed.
236
- - Values are converted to numeric types, with errors coerced into NaN.
237
- - Rows with missing or invalid prices are removed.
238
- """)
239
-
240
- df['Price'] = df['Price'].str.replace(',', '')
241
- df['Price'] = pd.to_numeric(df['Price'], errors='coerce')
242
- df = df.dropna(subset=['Price'])
243
-
244
- st.write("""
245
- ### 12. **Adding Free Services Based on Rating**
246
- - A dictionary `hotel_ammenities_by_star` is used to define the free services based on the hotel’s rating.
247
- - These services are added to the DataFrame as a new column 'Free Services'.
248
- """)
249
-
250
- hotel_ammenities_by_star = {
251
- 1.0: ["Wi-Fi", "Complimentary parking", "Basic toiletries"],
252
- 2.0: ["Wi-Fi", "Complimentary parking", "Basic toiletries", "Laundry facilities", "Local calls"],
253
- 3.0: ["Wi-Fi", "Complimentary parking", "Basic toiletries", "Fitness center access", "Hair dryer"],
254
- 4.0: ["Wi-Fi", "Complimentary parking", "Basic toiletries", "Welcome drink", "Turndown service", "Minibar (select items)", "Complimentary newspapers", "Shoe shine service", "In-room safe"],
255
- 5.0: ["Wi-Fi", "Complimentary parking", "Basic toiletries", "Spa facilities (sauna, steam room)", "Complimentary upgrade (subject to availability)", "Personal shopping assistant", "In-room minibar (select items)", "Kids' club and childcare services", "Transportation within city limits", "Unlimited local calls and faxes"]
256
- }
257
-
258
- df['Free Services'] = df['Rating'].apply(lambda rating: hotel_ammenities_by_star.get(rating, []))
259
-
260
- st.write("""
261
- ### 13. **Store Cleaned Data in Session State**
262
- The cleaned DataFrame is stored in Streamlit’s session state, allowing it to persist across pages or interactions.
263
- """)
264
-
265
- st.session_state.cleaned_data = df
266
-
267
- st.write("""
268
- ### 14. **Displaying Cleaned Data**
269
- The cleaned dataset is displayed as a preview by showing the first few rows of the DataFrame.
270
- """)
271
-
272
- st.write("### Cleaned Data Preview")
273
- st.dataframe(df.head())
274
-
275
- st.write("""
276
- ### 15. **Download Button for CSV**
277
- A button is provided for the user to download the cleaned data as a CSV file, enabling easy access to the results.
278
- """)
279
-
280
- st.write("### Download the Cleaned Data")
281
- st.download_button(
282
- label="Download CSV",
283
- data=df.to_csv(index=False),
284
- file_name="Cleaned_Agoda_Data.csv",
285
- mime="text/csv"
286
- )
287
-
288
- st.write("""
289
- ### 16. **Dataset Information**
290
- Displays basic information about the DataFrame such as column data types, non-null counts, and memory usage.
291
- """)
292
-
293
- st.write("### Dataset Information")
294
- st.text(df.info())
 
2
  import pandas as pd
3
  import os
4
  from io import StringIO
 
5
  import re
6
  import numpy as np
7
 
 
12
  df = st.session_state.get("dataset")
13
 
14
  if df is not None:
15
+
16
+ # Dataset Preview
17
  st.subheader("Dataset Preview:")
18
+ st.write(df.head()) # Display the first 5 rows
19
 
20
  st.subheader("Info of the Dataset:")
21
  # Redirect the output of df.info() to a string buffer
22
  buffer = StringIO()
23
  df.info(buf=buffer)
 
 
 
24
 
25
+ # Display the content in Streamlit as Markdown
26
+ st.subheader("Info of the Dataset:")
27
+ st.markdown(f"```{buffer.getvalue()}```")
28
 
29
+ st.subheader("Dataset Shape (Rows, Columns):")
30
  st.write(df.shape)
31
+
32
+ # Cleaning the 'Hotel Name' column
33
+ st.subheader("Cleaning the 'Hotel Name' Column:")
34
+ df['Hotel Name'] = df['Hotel Name'].str.replace('\n', ',')
35
+ df['Hotel Name'] = df['Hotel Name'].str.replace('-View on map', '')
36
+ df['Hotel Name'] = df['Hotel Name'].str.replace('View on map', '')
37
+ df['Hotel Name'] = df['Hotel Name'].str.replace('-', '')
38
+ df.drop(index=df[df['Hotel Name'].isin(['2021', '2022', '2023'])].index, inplace=True)
39
+
40
+ st.write("Cleaned 'Hotel Name' Column:")
41
+ st.write(df[['Hotel Name']].head())
42
+
43
+ # Cleaning the 'Rating' column
44
+ st.subheader("Cleaning the 'Rating' Column:")
45
+ def extract_rating(rating_str):
46
+ if isinstance(rating_str, str):
47
+ match = re.search(r'(\d+)', rating_str)
48
+ if match:
49
+ return float(match.group(1))
50
+ return np.nan
51
+
52
+ df['Rating'] = df['Rating'].apply(extract_rating)
53
+
54
+ st.write("Cleaned 'Rating' Column:")
55
+ st.write(df[['Rating']].head())
56
+
57
+ # Cleaning the 'Location' column
58
+ st.subheader("Cleaning the 'Location' Column:")
59
+ df['Location'] = df['Location'].apply(lambda x: re.findall(r'\w+', x))
60
+ df['Location'] = df['Location'].apply(lambda x: ' '.join(x))
61
+ df['Location'] = df['Location'].apply(lambda x: re.sub(r"\d+", r",\g<0>", x))
62
+
63
+ st.write("Cleaned 'Location' Column:")
64
+ st.write(df[['Location']].head())
65
+
66
+ # Cleaning the 'Discount' column
67
+ st.subheader("Cleaning the 'Discount' Column:")
68
+ def f(x):
69
+ return re.findall(r"\d{2}", str(x))
70
+
71
+ df.Discount = df.Discount.apply(f).str[0]
72
+ df['Discount'] = df.apply(lambda row: '0' if (str(row['Discount']) in ['Nan', 'np.nan', 'nan', ''] or pd.isnull(row['Discount'])) else row['Discount'], axis=1)
73
+ df.Discount = df.Discount.apply(lambda x: int(x))
74
+ df.Discount = df.Discount.apply(lambda x: x - 50 if x > 50 else x)
75
+
76
+ st.write("Cleaned 'Discount' Column:")
77
+ st.write(df[['Discount']].head())
78
+
79
+ # Cleaning the 'Review Text' column
80
+ st.subheader("Cleaning the 'Review Text' Column:")
81
+ df['Review Text'] = df['Review Text'].str.split('.').str[0]
82
+ df['Review Text'].replace('10', 'Exceptional', inplace=True)
83
+ df['Review Text'].replace('9', 'Excellent', inplace=True)
84
+ df['Review Text'].replace('8', 'Very Good', inplace=True)
85
+ df['Review Text'].replace('7', 'Good', inplace=True)
86
+ df['Review Text'].replace(['2', '4', '5', '6'], 'Bad', inplace=True)
87
+ df['Review Text'] = df.apply(lambda row: 'Exceptional' if (row['Rating'] == 5) and (row['Review Text'] in ['Nan', 'np', 'km', 'stars', 'Review'] or pd.isnull(row['Review Text'])) else row['Review Text'], axis=1)
88
+
89
+ st.write("Cleaned 'Review Text' Column:")
90
+ st.write(df[['Review Text']].head())
91
+
92
+ # Cleaning the 'Reviews' column
93
+ st.subheader("Cleaning the 'Reviews' Column:")
94
+ df['Reviews'] = df.apply(lambda row: '0' if (str(row['Reviews']) in ['nan', 'np', 'np.nan']) else row['Reviews'], axis=1)
95
+
96
+ st.write("Cleaned 'Reviews' Column:")
97
+ st.write(df[['Reviews']].head())
98
+
99
+ # Cleaning the 'Cashback' column
100
+ st.subheader("Cleaning the 'Cashback' Column:")
101
+ df['Cashback'] = df.apply(lambda row: '0' if (str(row['Cashback']) in ['Nan', 'np.nan', 'nan', ''] or pd.isnull(row['Cashback'])) else row['Cashback'], axis=1)
102
+
103
+ st.write("Cleaned 'Cashback' Column:")
104
+ st.write(df[['Cashback']].head())
105
+
106
+ # Cleaning the 'Cancellation' column
107
+ st.subheader("Cleaning the 'Cancellation' Column:")
108
+ df['Cancellation'] = df['Cancellation'].replace('#NAME?', 'No')
109
+
110
+ st.write("Cleaned 'Cancellation' Column:")
111
+ st.write(df[['Cancellation']].head())
112
+
113
+ # Cleaning the 'Price' column
114
+ st.subheader("Cleaning the 'Price' Column:")
115
+ df['Price'] = df['Price'].str.replace(',', '')
116
+ df['Price'] = pd.to_numeric(df['Price'], errors='coerce')
117
+ df = df.dropna(subset=['Price'])
118
+
119
+ st.write("Cleaned 'Price' Column:")
120
+ st.write(df[['Price']].head())
121
+
122
+ # Adding Free Services based on Rating
123
+ st.subheader("Adding Free Services Based on Rating:")
124
+ hotel_ammenities_by_star = {
125
+ 1.0: ["Wi-Fi", "Complimentary parking", "Basic toiletries"],
126
+ 2.0: ["Wi-Fi", "Complimentary parking", "Basic toiletries", "Laundry facilities", "Local calls"],
127
+ 3.0: ["Wi-Fi", "Complimentary parking", "Basic toiletries", "Fitness center access", "Hair dryer"],
128
+ 4.0: ["Wi-Fi", "Complimentary parking", "Basic toiletries", "Welcome drink", "Turndown service", "Minibar (select items)", "Complimentary newspapers", "Shoe shine service", "In-room safe"],
129
+ 5.0: ["Wi-Fi", "Complimentary parking", "Basic toiletries", "Spa facilities (sauna, steam room)", "Complimentary upgrade (subject to availability)", "Personal shopping assistant", "In-room minibar (select items)", "Kids' club and childcare services", "Transportation within city limits", "Unlimited local calls and faxes"]
130
+ }
131
+
132
+ df['Free Services'] = df['Rating'].apply(lambda rating: hotel_ammenities_by_star.get(rating, []))
133
+
134
+ st.write("Added Free Services Based on Rating:")
135
+ st.write(df[['Free Services']].head())
136
+
137
+ # Store the cleaned data in session state
138
+ st.session_state.cleaned_data = df
139
+
140
+ # Display cleaned data
141
+ st.subheader("Cleaned Data Preview:")
142
+ st.dataframe(df.head())
143
+
144
+ # Save cleaned data to CSV and allow the user to download it
145
+ st.subheader("Download the Cleaned Data")
146
+ st.download_button(
147
+ label="Download CSV",
148
+ data=df.to_csv(index=False),
149
+ file_name="Cleaned_Hotel_Data.csv",
150
+ mime="text/csv"
151
+ )
152
+
153
  else:
154
  st.warning("No dataset found. Please upload a dataset on the Home page.")
155
 
 
156
  # Define the URL of the background image (use your own image URL)
157
  # Apply custom CSS for the background image and overlay
158
  background_image_url = "https://cdn-uploads.huggingface.co/production/uploads/675fab3a2d0851e23d23cad3/MI0hTKaf1a2EmxUfA6TsV.png"
 
194
  """,
195
  unsafe_allow_html=True
196
  )
197
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
  # Navigation Buttons
199
  st.markdown(
200
  """
 
205
  """,
206
  unsafe_allow_html=True,
207
  )