trohith89 committed on
Commit
3d738d0
·
verified ·
1 Parent(s): 548cf58

Update pages/3_EDA_and_Feature_Engineering.py

Browse files
Files changed (1) hide show
  1. pages/3_EDA_and_Feature_Engineering.py +100 -13
pages/3_EDA_and_Feature_Engineering.py CHANGED
@@ -1,20 +1,106 @@
1
- import streamlit as st
2
- import pandas as pd
3
- import seaborn as sns
4
  import matplotlib.pyplot as plt
5
- import plotly.express as px
6
- from sklearn.preprocessing import LabelEncoder, PolynomialFeatures
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
- # Page Title
9
- st.title("Complete EDA and Feature Engineering")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
- st.markdown("""
12
- This page provides advanced Exploratory Data Analysis (EDA) and Feature Engineering using the dataset loaded in memory.
13
- ---
 
 
 
 
 
 
 
 
 
14
  """)
15
 
16
- # Check if dataset exists in session state
17
- if 'df' in st.session_state:
 
 
 
 
 
18
  df = st.session_state['df']
19
  st.success("Dataset loaded successfully.")
20
 
@@ -453,4 +539,5 @@ if 'df' in st.session_state:
453
  - 1: Perfect positive correlation (as one variable increases, the other increases)''')
454
 
455
  else:
456
- st.error("No dataset found. Please upload a dataset on the main page first.")
 
 
1
+ import streamlit as st
2
+ import numpy as np
3
+ import pandas as pd
4
  import matplotlib.pyplot as plt
5
+ import seaborn as sns
6
+ from io import StringIO
7
+ import sys
8
+
9
+ st.markdown("<h1 style='text-align:center; color:white;'>EDA and Feature Engineering</h1>",unsafe_allow_html=True)
10
+
11
+ # Define the URL of the background image (use your own image URL)
12
+ background_image_url = "https://cdn-uploads.huggingface.co/production/uploads/67441c51a784a9d15cb12871/7ZCmkouk1pS37_kREZmYJ.jpeg"
13
+
14
+ # Apply custom CSS for the background image and overlay
15
+ st.markdown(
16
+ f"""
17
+ <style>
18
+ .stApp {{
19
+ background-image: url("{background_image_url}");
20
+ background-size: cover; /* Ensures the image covers the full screen */
21
+ background-position: center; /* Centers the background image */
22
+ background-attachment: fixed; /* Keeps the background fixed as you scroll */
23
+ height: 100vh;
24
+ width: 100%;
25
+ overflow: hidden; /* Prevents any overflow that might cause the background image to zoom */
26
+ }}
27
+
28
+ /* Semi-transparent overlay */
29
+ .stApp::before {{
30
+ content: "";
31
+ position: absolute;
32
+ top: 0;
33
+ left: 0;
34
+ width: 100%;
35
+ height: 100%;
36
+ background: rgba(0, 0, 0, 0.4); /* Adjust overlay opacity here (0.4 = 40% opaque black) */
37
+ z-index: -1;
38
+ }}
39
+ /* Styling the content to ensure text visibility */
40
+ .stMarkdown {{
41
+ color: white; /* White text to ensure visibility */
42
+ font-size: 100px; /* Adjust font size for better readability */
43
+ }}
44
+ </style>
45
+ """,
46
+ unsafe_allow_html=True
47
+ )
48
+ # Title of the Streamlit app
49
+ st.title("Exploratory Data Analysis (EDA) on Agoda Hotel Dataset")
50
+
51
+ # Introduction and Aim
52
+ st.header("Aim of the EDA")
53
+ st.write("""
54
+ The main objective of this EDA is to analyze Agoda's hotel dataset to identify key factors influencing hotel pricing strategies and customer booking preferences.
55
+ The analysis will focus on uncovering patterns, trends, and relationships in hotel ratings, pricing structures, discounts, and free services.
56
+ By leveraging these insights, Agoda can optimize its pricing strategy, predict booking preferences, and enhance revenue generation while maintaining customer satisfaction.
57
+ """)
58
 
59
+ # Description of the Data
60
+ st.header("Description of the Data")
61
+ st.write("""
62
+ **Overall Summary:** We are analyzing the Agoda dataset by performing EDA and Statistical Tests on the data that has already been cleaned through data wrangling to address any messiness or missing information.
63
+
64
+ **Table - Agoda_df:** The cleaned dataset consists of over 3,500 hotel listings, which will be used as test subjects for the hotel pricing period.
65
+ **Dataset Details:**
66
+ The dataset contains information about 3,219 hotel room listings with 12 features, each detailing aspects of the listing. Below is the description of each column:
67
+ | Column Name | Description |
68
+ |-----------------|---------------------------------------------------------------------------|
69
+ | hotel_name | Name of the hotel. |
70
+ | rating | Average customer rating of the hotel (float, range 1-5). |
71
+ | location | Address or locality of the hotel. |
72
+ | review_text | Customer feedback or comments about the hotel. |
73
+ | reviews | Total number of customer reviews for the hotel. |
74
+ | cashback | Cashback amount offered for the booking. |
75
+ | discount | Discount percentage applied to the room price. |
76
+ | free_services | Free services provided (e.g., breakfast, Wi-Fi). |
77
+ | cancellation | Cancellation policy for the booking (e.g., free, non-refundable). |
78
+ | price | Price of the room after discounts and cashback (float). |
79
+ | state | The state where the hotel is located. |
80
+ | category | Target variable representing the room type or category (e.g., budget, luxury). |
81
+ """)
82
 
83
+ # Table-wise EDA & Necessary Tests
84
+ st.header("Table-wise EDA and Necessary Statistical Tests")
85
+ st.write("""
86
+ **Agoda_df:** Cleaned dataset with hotel details and key features like ratings, price, reviews, cashback, discounts, and free services.
87
+ The EDA will involve the following steps:
88
+ - **Summary Statistics:** Analyze the central tendency, spread, and shape of the distribution of each feature.
89
+ - **Data Distribution:** Visualize the distribution of key features like price, ratings, reviews, cashback, etc.
90
+ - **Correlation Analysis:** Analyze relationships between numeric features like price, ratings, reviews, cashback, etc.
91
+ - **Categorical Data Analysis:** Explore categorical variables like hotel category, cancellation policy, state, and location using frequency tables and visualizations.
92
+ - **Missing Value Analysis:** Ensure no missing values remain, and check the need for imputations.
93
+ - **Outlier Detection:** Identify any outliers that may skew the analysis or predictions.
94
+ - **Statistical Tests:** Apply appropriate statistical tests to identify significant differences or relationships (e.g., t-tests for comparing means, chi-squared for categorical variables).
95
  """)
96
 
97
+ # Placeholder for further detailed code or visualizations
98
+ st.write("Further steps will include generating visualizations and statistical tests to explore relationships between features in more detail.")
99
+
100
+ # Access dataset from session state
101
+ data= st.session_state.get("dataset")
102
+
103
+ if data is not None:
104
  df = st.session_state['df']
105
  st.success("Dataset loaded successfully.")
106
 
 
539
  - 1: Perfect positive correlation (as one variable increases, the other increases)''')
540
 
541
  else:
542
+ st.warning("No dataset found in session state. Please load the dataset into `st.session_state['dataset']`.")
543
+