Mpavan45 committed on
Commit
ad49f99
·
verified ·
1 Parent(s): 9a7d598

Update pages/2_Data Cleaning and Processing .py

Browse files
pages/2_Data Cleaning and Processing .py CHANGED
@@ -17,7 +17,6 @@ if df is not None:
17
  st.subheader("Dataset Preview:")
18
  st.write(df.head()) # Display the first 5 rows
19
 
20
- st.subheader("Info of the Dataset:")
21
  # Redirect the output of df.info() to a string buffer
22
  buffer = StringIO()
23
  df.info(buf=buffer)
@@ -39,6 +38,13 @@ if df is not None:
39
 
40
  st.write("Cleaned 'Hotel Name' Column:")
41
  st.write(df[['Hotel Name']].head())
 
 
 
 
 
 
 
42
 
43
  # Cleaning the 'Rating' column
44
  st.subheader("Cleaning the 'Rating' Column:")
@@ -53,6 +59,12 @@ if df is not None:
53
 
54
  st.write("Cleaned 'Rating' Column:")
55
  st.write(df[['Rating']].head())
 
 
 
 
 
 
56
 
57
  # Cleaning the 'Location' column
58
  st.subheader("Cleaning the 'Location' Column:")
@@ -63,7 +75,11 @@ if df is not None:
63
  st.write("Cleaned 'Location' Column:")
64
  st.write(df[['Location']].head())
65
 
66
-
 
 
 
 
67
  # Cleaning the 'Discount' column
68
  st.subheader("Cleaning the 'Discount' Column:")
69
  def f(x):
@@ -76,6 +92,13 @@ if df is not None:
76
 
77
  st.write("Cleaned 'Discount' Column:")
78
  st.write(df[['Discount']].head())
 
 
 
 
 
 
 
79
 
80
  # Cleaning the 'Review Text' column
81
  st.subheader("Cleaning the 'Review Text' Column:")
@@ -89,6 +112,12 @@ if df is not None:
89
 
90
  st.write("Cleaned 'Review Text' Column:")
91
  st.write(df[['Review Text']].head())
 
 
 
 
 
 
92
 
93
  # Cleaning the 'Reviews' column
94
  st.subheader("Cleaning the 'Reviews' Column:")
@@ -97,26 +126,44 @@ if df is not None:
97
  st.write("Cleaned 'Reviews' Column:")
98
  st.write(df[['Reviews']].head())
99
 
 
 
 
 
 
 
 
100
  # Cleaning the 'Cashback' column
101
  st.subheader("Cleaning the 'Cashback' Column:")
102
  df['Cashback'] = df.apply(lambda row: '0' if (str(row['Cashback']) in ['Nan', 'np.nan', 'nan', ''] or pd.isnull(row['Cashback'])) else row['Cashback'], axis=1)
103
 
104
  st.write("Cleaned 'Cashback' Column:")
105
  st.write(df[['Cashback']].head())
106
-
 
 
 
107
  # Cleaning the 'Cancellation' column
108
  st.subheader("Cleaning the 'Cancellation' Column:")
109
  df['Cancellation'] = df['Cancellation'].replace('#NAME?', 'No')
110
 
111
  st.write("Cleaned 'Cancellation' Column:")
112
  st.write(df[['Cancellation']].head())
113
-
 
 
 
114
  # Cleaning the 'Price' column
115
  st.subheader("Cleaning the 'Price' Column:")
116
  df['Price'] = df['Price'].str.replace(',', '')
117
  df['Price'] = pd.to_numeric(df['Price'], errors='coerce')
118
  df = df.dropna(subset=['Price'])
119
-
 
 
 
 
 
120
  st.write("Cleaned 'Price' Column:")
121
  st.write(df[['Price']].head())
122
 
@@ -134,14 +181,36 @@ if df is not None:
134
 
135
  st.write("Added Free Services Based on Rating:")
136
  st.write(df[['Free Services']].head())
 
 
 
 
 
137
 
138
  # Store the cleaned data in session state
139
  st.session_state.cleaned_data = df
140
-
 
 
 
141
  # Display cleaned data
142
  st.subheader("Cleaned Data Preview:")
143
- st.dataframe(df.head())
 
 
 
 
 
 
144
 
 
 
 
 
 
 
 
 
145
  # Save cleaned data to CSV and allow the user to download it
146
  st.subheader("Download the Cleaned Data")
147
  st.download_button(
@@ -151,6 +220,7 @@ if df is not None:
151
  mime="text/csv"
152
  )
153
 
 
154
  else:
155
  st.warning("No dataset found. Please upload a dataset on the Home page.")
156
 
 
17
  st.subheader("Dataset Preview:")
18
  st.write(df.head()) # Display the first 5 rows
19
 
 
20
  # Redirect the output of df.info() to a string buffer
21
  buffer = StringIO()
22
  df.info(buf=buffer)
 
38
 
39
  st.write("Cleaned 'Hotel Name' Column:")
40
  st.write(df[['Hotel Name']].head())
41
+ st.write("""
42
+ ### 1. **Cleaning the 'Hotel Name' Column**
43
+ This section cleans the 'Hotel Name' column by:
44
+ - Removing missing values.
45
+ - Removing unwanted text such as newline characters, "View on map", and years.
46
+ - Dropping rows with irrelevant hotel names (like "2021", "2022", "2023").
47
+ """)
48
 
49
  # Cleaning the 'Rating' column
50
  st.subheader("Cleaning the 'Rating' Column:")
 
59
 
60
  st.write("Cleaned 'Rating' Column:")
61
  st.write(df[['Rating']].head())
62
+ st.write("""
63
+ ### 2. **Cleaning the 'Rating' Column**
64
+ This part deals with cleaning the 'Rating' column:
65
+ - Removing missing values.
66
+ - Extracting numerical values from string ratings using regular expressions.
67
+ """)
68
 
69
  # Cleaning the 'Location' column
70
  st.subheader("Cleaning the 'Location' Column:")
 
75
  st.write("Cleaned 'Location' Column:")
76
  st.write(df[['Location']].head())
77
 
78
+ st.write("""
79
+ ### 3. **Cleaning the 'Location' Column**
80
+ - Missing values in the 'Location' column are dropped.
81
+ - The location is cleaned by extracting only words and formatting them consistently.
82
+ """)
83
  # Cleaning the 'Discount' column
84
  st.subheader("Cleaning the 'Discount' Column:")
85
  def f(x):
 
92
 
93
  st.write("Cleaned 'Discount' Column:")
94
  st.write(df[['Discount']].head())
95
+ st.write("""
96
+ ### 4. **Cleaning the 'Discount' Column**
97
+ This section:
98
+ - Extracts discount values from strings.
99
+ - Replaces missing or invalid discounts with '0'.
100
+ - Adjusts discounts greater than 50 by reducing them by 50.
101
+ """)
102
 
103
  # Cleaning the 'Review Text' column
104
  st.subheader("Cleaning the 'Review Text' Column:")
 
112
 
113
  st.write("Cleaned 'Review Text' Column:")
114
  st.write(df[['Review Text']].head())
115
+ st.write("""
116
+ ### 5. **Cleaning the 'Review Text' Column**
117
+ - The 'Review Text' is truncated to only include the first sentence.
118
+ - Ratings are mapped to corresponding descriptive review text.
119
+ - Invalid reviews are replaced based on conditions like missing or incomplete data.
120
+ """)
121
 
122
  # Cleaning the 'Reviews' column
123
  st.subheader("Cleaning the 'Reviews' Column:")
 
126
  st.write("Cleaned 'Reviews' Column:")
127
  st.write(df[['Reviews']].head())
128
 
129
+ st.write("""
130
+ ### 6. **Cleaning the 'Reviews' Column**
131
+ This part ensures that:
132
+ - Missing or invalid review values in the 'Reviews' column are replaced with '0'.
133
+ """)
134
+
135
+
136
  # Cleaning the 'Cashback' column
137
  st.subheader("Cleaning the 'Cashback' Column:")
138
  df['Cashback'] = df.apply(lambda row: '0' if (str(row['Cashback']) in ['Nan', 'np.nan', 'nan', ''] or pd.isnull(row['Cashback'])) else row['Cashback'], axis=1)
139
 
140
  st.write("Cleaned 'Cashback' Column:")
141
  st.write(df[['Cashback']].head())
142
+ st.write("""
143
+ ### 7. **Cleaning the 'Cashback' Column**
144
+ - Missing or invalid cashback values are replaced with '0' for consistency.
145
+ """)
146
  # Cleaning the 'Cancellation' column
147
  st.subheader("Cleaning the 'Cancellation' Column:")
148
  df['Cancellation'] = df['Cancellation'].replace('#NAME?', 'No')
149
 
150
  st.write("Cleaned 'Cancellation' Column:")
151
  st.write(df[['Cancellation']].head())
152
+ st.write("""
153
+ ### 8. **Cleaning the 'Cancellation' Column**
154
+ - Unwanted values (like '#NAME?') in the 'Cancellation' column are replaced with 'No'.
155
+ """)
156
  # Cleaning the 'Price' column
157
  st.subheader("Cleaning the 'Price' Column:")
158
  df['Price'] = df['Price'].str.replace(',', '')
159
  df['Price'] = pd.to_numeric(df['Price'], errors='coerce')
160
  df = df.dropna(subset=['Price'])
161
+ st.write("""
162
+ ### 9. **Cleaning the 'Price' Column**
163
+ - Commas in the 'Price' column are removed.
164
+ - Values are converted to numeric types, with errors coerced into NaN.
165
+ - Rows with missing or invalid prices are removed.
166
+ """)
167
  st.write("Cleaned 'Price' Column:")
168
  st.write(df[['Price']].head())
169
 
 
181
 
182
  st.write("Added Free Services Based on Rating:")
183
  st.write(df[['Free Services']].head())
184
+ st.write("""
185
+ ### 10. **Adding Free Services Based on Rating**
186
+ - A dictionary `hotel_ammenities_by_star` is used to define the free services based on the hotel’s rating.
187
+ - These services are added to the DataFrame as a new column 'Free Services'.
188
+ """)
189
 
190
  # Store the cleaned data in session state
191
  st.session_state.cleaned_data = df
192
+ st.write("""
193
+ ### 11. **Store Cleaned Data in Session State**
194
+ The cleaned DataFrame is stored in Streamlit’s session state, allowing it to persist across pages or interactions.
195
+ """)
196
  # Display cleaned data
197
  st.subheader("Cleaned Data Preview:")
198
+ st.dataframe(df)
199
+ st.write("""
200
+ ### 12. **Dataset Information**
201
+ Displays basic information about the DataFrame such as column data types, non-null counts, and memory usage.
202
+ """)
203
+ buffer = StringIO()
204
+ df.info(buf=buffer)
205
 
206
+ # Display the content in Streamlit as Markdown
207
+ st.subheader("Info of the Dataset:")
208
+ st.markdown(f"```{buffer.getvalue()}```")
209
+
210
+ st.write("""
211
+ ### 13. **Displaying Cleaned Data**
212
+ The cleaned dataset is displayed as a preview by showing the first few rows of the DataFrame.
213
+ """)
214
  # Save cleaned data to CSV and allow the user to download it
215
  st.subheader("Download the Cleaned Data")
216
  st.download_button(
 
220
  mime="text/csv"
221
  )
222
 
223
+
224
  else:
225
  st.warning("No dataset found. Please upload a dataset on the Home page.")
226