Mpavan45 committed on
Commit
7b7ee83
·
verified ·
1 Parent(s): bc48345

Rename pages/EDA and Feature Engineering.py to pages/EDA .py

Browse files
Files changed (2) hide show
  1. pages/EDA .py +163 -0
  2. pages/EDA and Feature Engineering.py +0 -40
pages/EDA .py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import seaborn as sns
4
+ import matplotlib.pyplot as plt
5
+
6
+ # Function to generate automatic insights for univariate analysis
7
def generate_univariate_insights(data, column):
    """Summarise one column as a Markdown bullet list.

    Args:
        data: Table-like object; ``data[column]`` must provide ``mean``,
            ``median``, ``std``, ``min`` and ``max`` (e.g. a pandas DataFrame).
        column: Label of the column to describe.

    Returns:
        A string that starts and ends with a newline and holds four bullet
        lines: mean, median, standard deviation (each with two decimals) and
        the unformatted minimum / maximum.
    """
    series = data[column]

    # Statistics are evaluated in the order they are reported.
    bullets = (
        f"- The mean value of '{column}' is {series.mean():.2f}.",
        f"- The median value is {series.median():.2f}, indicating the central tendency of the data.",
        f"- The standard deviation is {series.std():.2f}, suggesting the spread of the values.",
        f"- The minimum value observed is {series.min()}, and the maximum value is {series.max()}.",
    )
    return "\n" + "\n".join(bullets) + "\n"
21
+
22
+ # Function to generate automatic insights for bivariate analysis (scatter plot)
23
def generate_bivariate_insights(data, col1, col2):
    """Describe the correlation between two columns as Markdown bullets.

    Args:
        data: Table-like object whose columns support ``.corr(other)``
            (e.g. a pandas DataFrame).
        col1: Label of the first column.
        col2: Label of the second column.

    Returns:
        A string that starts and ends with a newline: one bullet with the
        correlation value (two decimals) followed by two fixed bullets that
        explain how to read a correlation coefficient.
    """
    x_values = data[col1]
    y_values = data[col2]
    coefficient = x_values.corr(y_values)

    bullets = [
        f"- The correlation between '{col1}' and '{col2}' is {coefficient:.2f}.",
        "- A correlation close to 1 indicates a strong positive relationship, "
        "while a correlation close to -1 indicates a strong negative relationship.",
        "- A correlation near 0 suggests no linear relationship between the variables.",
    ]
    return "\n" + "\n".join(bullets) + "\n"
32
+
33
+ # Function to generate automatic insights for multivariate analysis (pairplot)
34
def generate_multivariate_insights(data, columns):
    """Describe a pairplot of several columns as Markdown bullets.

    The correlation matrix of the selected columns is computed and used to
    name the most strongly (positively or negatively) correlated pair, so the
    text reflects the uploaded data instead of being purely generic.

    Args:
        data: pandas DataFrame (anything where ``data[columns].corr()``
            returns a square correlation table works).
        columns: Iterable of column labels that were plotted. Labels are
            converted with ``str`` for display, so non-string labels are fine.

    Returns:
        A string that starts and ends with a newline. It always contains the
        three general bullets; a fourth bullet naming the strongest pair is
        appended when at least one pair has a defined (non-NaN) correlation.
    """
    import math

    columns = list(columns)
    correlations = data[columns].corr()

    # Scan the upper triangle positionally so duplicate labels cannot turn a
    # single cell lookup into a sub-table.
    labels = list(correlations.columns)
    matrix = correlations.to_numpy()
    strongest = None  # (label_a, label_b, coefficient)
    for i, first in enumerate(labels):
        for j in range(i + 1, len(labels)):
            value = float(matrix[i, j])
            if math.isnan(value):
                # Constant or empty columns have no defined correlation.
                continue
            if strongest is None or abs(value) > abs(strongest[2]):
                strongest = (first, labels[j], value)

    bullets = [
        "- The pairplot shows the relationships between the selected numeric "
        f"variables: {', '.join(map(str, columns))}.",
        "- The diagonal displays the distributions of each variable.",
        "- Strong correlations (positive or negative) can be seen in the "
        "scatter plots between some variables.",
    ]
    if strongest is not None:
        first, second, value = strongest
        bullets.append(
            "- The strongest linear relationship among the selected variables "
            f"is between '{first}' and '{second}' (correlation {value:.2f})."
        )
    return "\n" + "\n".join(bullets) + "\n"
42
+
43
+ # Introduction to EDA
44
def _render_figure(fig):
    """Show a Matplotlib figure in the Streamlit page, then close it."""
    st.pyplot(fig)
    # pyplot keeps a reference to every figure it creates until it is closed;
    # the script creates new figures on each run, so close them once rendered.
    plt.close(fig)


# Introduction to EDA
st.markdown("""
# Exploratory Data Analysis (EDA)
Exploratory Data Analysis (EDA) is an essential step in the data analysis process. It involves:
- **Understanding the Structure**: By examining the dataset’s statistics and structure, we can identify patterns, trends, and potential issues.
- **Visualizing Distributions**: Histograms and boxplots give insight into the distribution of data, the spread of numerical values, and the presence of any outliers.
- **Finding Relationships**: Through scatter plots and correlation matrices, we can identify relationships between two or more variables, which helps in building predictive models.

EDA helps in:
- Cleaning the dataset by handling missing values, detecting outliers, and fixing errors.
- Gaining insights that can inform further analysis or modeling steps.
""")

# File uploader for dataset
uploaded_file = st.file_uploader("Upload your dataset (CSV format):", type=["csv"])

if uploaded_file is not None:
    # Read and display the dataset; report unreadable files in the page
    # instead of failing with a traceback.
    try:
        data = pd.read_csv(uploaded_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
        st.error(f"Could not read the uploaded file as CSV: {exc}")
        st.stop()

    st.write("### Uploaded Dataset:")
    st.dataframe(data)

    # Dataset Overview
    st.write("### Dataset Overview:")
    st.write(data.describe())

    # Missing values in the dataset
    st.write("### Missing Values:")
    st.write(data.isnull().sum())

    # Plain lists: usable in truthiness checks and as widget options/defaults.
    numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns.tolist()
    categorical_columns = data.select_dtypes(include=['object', 'category']).columns.tolist()

    # Correlation matrix for numerical columns
    st.write("### Correlation Matrix:")
    if len(numeric_columns) > 1:
        # Computed once and reused for both the table and the heatmap.
        correlation_matrix = data[numeric_columns].corr()
        st.write(correlation_matrix)

        st.write("Heatmap of Correlation Matrix:")
        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
        _render_figure(fig)

    if numeric_columns:
        # Univariate Plots (For a single column)
        st.write("### Univariate Analysis: Distribution of Columns")
        selected_numeric_column = st.selectbox(
            "Select a Numeric Column for Univariate Analysis", numeric_columns
        )

        # Histogram for univariate distribution
        st.write(f"Histogram for '{selected_numeric_column}':")
        fig, ax = plt.subplots()
        sns.histplot(data[selected_numeric_column], kde=True, ax=ax)
        _render_figure(fig)

        # Display automatic insights for univariate analysis
        univariate_insights = generate_univariate_insights(data, selected_numeric_column)
        st.write("### Insights:")
        st.write(univariate_insights)

        # Boxplot for univariate distribution
        st.write(f"Boxplot for '{selected_numeric_column}':")
        fig, ax = plt.subplots()
        sns.boxplot(x=data[selected_numeric_column], ax=ax)
        _render_figure(fig)

        # Bivariate Plots (For two columns)
        st.write("### Bivariate Analysis: Relationships between Two Variables")
        selected_bivariate_columns = st.multiselect(
            "Select Two Columns for Bivariate Analysis",
            options=numeric_columns,
            default=numeric_columns[:2],
        )

        # The scatter plot is only drawn for exactly two selected columns.
        if len(selected_bivariate_columns) == 2:
            x_column, y_column = selected_bivariate_columns
            st.write(f"Scatter Plot between '{x_column}' and '{y_column}':")
            fig, ax = plt.subplots()
            sns.scatterplot(x=data[x_column], y=data[y_column], ax=ax)
            _render_figure(fig)

            # Display automatic insights for bivariate analysis
            bivariate_insights = generate_bivariate_insights(data, x_column, y_column)
            st.write("### Insights:")
            st.write(bivariate_insights)

        # Multivariate Plots (For multiple columns)
        st.write("### Multivariate Analysis: Relationships between Multiple Variables")
        selected_multivariate_columns = st.multiselect(
            "Select Columns for Multivariate Analysis",
            options=numeric_columns,
            default=numeric_columns[:3],
        )

        if len(selected_multivariate_columns) > 1:
            st.write(f"Pairplot for selected variables: {', '.join(selected_multivariate_columns)}")
            # pairplot is called without an ax; render the figure of the grid
            # it returns rather than a separately created empty figure.
            pair_grid = sns.pairplot(data[selected_multivariate_columns])
            _render_figure(pair_grid.figure)

            # Display automatic insights for multivariate analysis
            multivariate_insights = generate_multivariate_insights(data, selected_multivariate_columns)
            st.write("### Insights:")
            st.write(multivariate_insights)

        # Categorical vs Numeric (boxplots)
        if categorical_columns:
            selected_cat_column = st.selectbox("Select a Categorical Column for Analysis", categorical_columns)

            st.write(f"Boxplot for '{selected_cat_column}' vs Numeric Column:")
            selected_numeric_column_for_cat = st.selectbox("Select a Numeric Column to Plot", numeric_columns)
            fig, ax = plt.subplots()
            sns.boxplot(x=data[selected_cat_column], y=data[selected_numeric_column_for_cat], ax=ax)
            _render_figure(fig)

            st.write("### Insights:")
            st.write(f"Boxplot shows the distribution of '{selected_numeric_column_for_cat}' values for each category in '{selected_cat_column}'. It helps identify if the numerical values differ across categories.")
    else:
        # Every plot above needs at least one numeric column to select.
        st.info("No numeric columns were found in the dataset, so the plots are skipped.")

    # Closing note (this page does not offer a dataset download).
    st.markdown(
        "\n"
        "This analysis provides a basic understanding of the dataset.\n"
        "You can now proceed with further analysis or modeling.\n"
    )
else:
    st.warning("Please upload a dataset to proceed with EDA.")
pages/EDA and Feature Engineering.py DELETED
@@ -1,40 +0,0 @@
1
- import streamlit as st
2
- import pandas as pd
3
-
4
- # EDA and Feature Engineering Page
5
# EDA and Feature Engineering Page
st.title("EDA and Feature Engineering")
st.markdown("""
This section is dedicated to exploratory data analysis (EDA) and feature engineering.
You can upload your dataset and analyze it here.
""")

# File uploader for dataset
uploaded_file = st.file_uploader("Upload your dataset (CSV format):", type=["csv"])

if uploaded_file is not None:
    # Read and display the dataset
    data = pd.read_csv(uploaded_file)
    st.write("### Uploaded Dataset:")
    st.dataframe(data)

    # Overview of the dataset
    st.write("### Dataset Overview:")
    st.write(data.describe())

    # Missing values in the dataset
    st.write("### Missing Values:")
    st.write(data.isnull().sum())

    # Correlation matrix. Restrict to numeric columns first: correlating a
    # frame that still contains text columns raises on pandas >= 2.0.
    st.write("### Correlation Matrix:")
    st.write(data.select_dtypes(include="number").corr())

    st.markdown(
        "\n"
        "Based on the insights from this analysis, you can proceed to perform feature engineering by:\n"
        "- Handling missing values.\n"
        "- Creating or transforming features.\n"
        "- Encoding categorical variables.\n"
        "- Normalizing or scaling numerical features.\n"
    )
else:
    st.warning("Please upload a dataset to proceed with EDA.")