trohith89 committed on
Commit
12c2ae4
·
verified ·
1 Parent(s): feece51

Rename pages/EDA.py to pages/Data_CLeaning_and_Preprocessing.py

Browse files
pages/Data_CLeaning_and_Preprocessing.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import seaborn as sns
4
+ import matplotlib.pyplot as plt
5
+
6
# Streamlit page: upload a CSV, show summary statistics, missing values,
# duplicate counts and basic plots, then offer a de-duplicated copy for download.


def _render_figure(fig):
    """Show a Matplotlib figure on the page, then close it.

    Figures created through pyplot stay registered with Matplotlib until they
    are closed, so closing each one after rendering keeps them from piling up
    across script reruns.
    """
    st.pyplot(fig)
    plt.close(fig)


# Configure the Streamlit app
st.title("Exploratory Data Analysis (EDA) App")
st.markdown("""
By performing simple Exploratory Data Analysis (EDA), we can examine the data, identify patterns, and detect anomalies or inconsistencies. This process allows us to clean and preprocess the dataset effectively, ensuring it is well-structured and ready for further analysis or modeling. Simple EDA helps uncover hidden insights, address missing or erroneous values, and optimize the data for better decision-making.
""")

# File uploader for dataset
uploaded_file = st.file_uploader("Upload your dataset (CSV format):", type=["csv"])

if uploaded_file is not None:
    # Read and display the dataset
    data = pd.read_csv(uploaded_file)
    st.write("### Uploaded Dataset:")
    st.dataframe(data)

    # Overview of the dataset
    st.write("### Dataset Overview:")
    st.write(data.describe())

    # Missing values in the dataset
    st.write("### Missing Values:")
    st.write(data.isnull().sum())

    # Duplicate rows in the dataset
    st.write("### Duplicate Rows:")
    st.write(f"Number of duplicate rows: {data.duplicated().sum()}")

    # Every numeric dtype counts (not only float64/int64), and the selection is
    # computed once so the plots and the correlation section agree.
    numeric_columns = data.select_dtypes(include="number").columns

    # Visualizations for numeric columns
    st.write("### Numeric Column Visualizations:")
    if len(numeric_columns) > 0:
        st.write("Histograms:")
        # Let pandas build its own grid with one panel per column. Handing it
        # a single pre-made Axes makes it clear that figure and ignore
        # figsize, and it raises ValueError when nothing is numeric.
        hist_axes = data[numeric_columns].hist(figsize=(10, 8))
        _render_figure(hist_axes.ravel()[0].figure)

        st.write("Boxplot:")
        fig, ax = plt.subplots()
        sns.boxplot(data=data, orient='h', ax=ax)
        _render_figure(fig)
    else:
        st.write("No numeric columns available for visualization.")

    # Value counts and bar plot for categorical data
    categorical_columns = data.select_dtypes(include=['object', 'category']).columns
    if len(categorical_columns) > 0:
        selected_cat_col = st.selectbox("Select a Categorical Column to Analyze", categorical_columns)
        st.write(f"Value Counts for '{selected_cat_col}':")
        st.write(data[selected_cat_col].value_counts())

        st.write(f"Bar Plot for '{selected_cat_col}':")
        fig, ax = plt.subplots()
        sns.countplot(x=selected_cat_col, data=data, ax=ax)
        _render_figure(fig)
    else:
        st.write("No categorical columns available for analysis.")

    # Correlation matrix (needs at least two numeric columns)
    if len(numeric_columns) > 1:
        correlation_matrix = data[numeric_columns].corr()
        st.write("### Correlation Matrix:")
        st.write(correlation_matrix)

        st.write("Heatmap of Correlation Matrix:")
        fig, ax = plt.subplots()
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
        _render_figure(fig)

    # Clean the data: only exact duplicate rows are removed. Missing values
    # are reported above but deliberately left untouched here.
    st.write("### Cleaned Dataset:")
    cleaned_data = data.drop_duplicates()
    st.dataframe(cleaned_data)

    # Download button for the cleaned dataset
    cleaned_csv = cleaned_data.to_csv(index=False).encode('utf-8')
    st.download_button(
        label="Download Cleaned Dataset",
        data=cleaned_csv,
        file_name="cleaned_dataset.csv",
        mime="text/csv"
    )

    st.markdown("""
    This analysis provides a basic understanding of the dataset.
    You can now download the cleaned dataset and proceed with further analysis or modeling.
    """)
else:
    st.warning("Please upload a dataset to proceed with Simple EDA.")
pages/EDA.py DELETED
@@ -1,163 +0,0 @@
1
- import streamlit as st
2
- import pandas as pd
3
- import seaborn as sns
4
- import matplotlib.pyplot as plt
5
-
6
# Function to generate automatic insights for univariate analysis
def generate_univariate_insights(data, column):
    """Return a Markdown bullet list summarising one numeric column.

    Args:
        data: Object indexable by ``column`` (the page passes a pandas
            DataFrame) whose column exposes ``count``, ``mean``, ``median``,
            ``std``, ``min`` and ``max``.
        column: Name of the column to summarise.

    Returns:
        A string of Markdown bullets giving the mean, median and standard
        deviation (two decimals) and the raw minimum and maximum. If the
        column has no non-missing values, a single bullet saying so is
        returned instead of a list of "nan" statistics.
    """
    values = data[column]

    # count() ignores missing entries; with nothing left every statistic
    # below would be NaN and the summary would be meaningless.
    if values.count() == 0:
        return f"\n- The column '{column}' has no non-missing values to summarize.\n"

    mean_val = values.mean()
    median_val = values.median()
    std_val = values.std()
    min_val = values.min()
    max_val = values.max()

    bullets = [
        f"- The mean value of '{column}' is {mean_val:.2f}.",
        f"- The median value is {median_val:.2f}, indicating the central tendency of the data.",
        f"- The standard deviation is {std_val:.2f}, suggesting the spread of the values.",
        f"- The minimum value observed is {min_val}, and the maximum value is {max_val}.",
    ]
    return "\n" + "\n".join(bullets) + "\n"
21
-
22
# Function to generate automatic insights for bivariate analysis (scatter plot)
def generate_bivariate_insights(data, col1, col2):
    """Return a Markdown bullet list describing how two columns correlate.

    Args:
        data: Object indexable by column name (the page passes a pandas
            DataFrame) whose columns expose ``corr``.
        col1: Name of the first column.
        col2: Name of the second column.

    Returns:
        A string of Markdown bullets with the correlation (two decimals) and
        a short guide to reading it. If the correlation comes back as NaN, a
        single bullet stating that it is undefined is returned instead, so
        the reading guide is never attached to a "nan" value.
    """
    correlation = data[col1].corr(data[col2])

    if pd.isna(correlation):
        return (
            f"\n- The correlation between '{col1}' and '{col2}' is undefined "
            "(a column is constant or there are too few paired values).\n"
        )

    bullets = [
        f"- The correlation between '{col1}' and '{col2}' is {correlation:.2f}.",
        "- A correlation close to 1 indicates a strong positive relationship, while a correlation close to -1 indicates a strong negative relationship.",
        "- A correlation near 0 suggests no linear relationship between the variables.",
    ]
    return "\n" + "\n".join(bullets) + "\n"
32
-
33
# Function to generate automatic insights for multivariate analysis (pairplot)
def generate_multivariate_insights(data, columns):
    """Return a Markdown bullet list explaining how to read the pairplot.

    Args:
        data: Dataset the pairplot was drawn from. Kept for interface
            compatibility; the text depends only on ``columns``, so no
            correlation matrix is computed here.
        columns: Labels of the plotted columns. They are converted with
            ``str`` so non-string labels can be listed too.

    Returns:
        A string of Markdown bullets naming the selected variables and
        describing the layout of the pairplot.
    """
    column_list = ', '.join(str(label) for label in columns)
    bullets = [
        f"- The pairplot shows the relationships between the selected numeric variables: {column_list}.",
        "- The diagonal displays the distributions of each variable.",
        "- Strong correlations (positive or negative) can be seen in the scatter plots between some variables.",
    ]
    return "\n" + "\n".join(bullets) + "\n"
42
-
43
# Streamlit page: upload a CSV and explore it with univariate, bivariate,
# multivariate and categorical-vs-numeric plots, each followed by a short
# generated description.


def _render_figure(fig):
    """Show a Matplotlib figure on the page, then close it.

    Figures created through pyplot stay registered with Matplotlib until they
    are closed, so closing each one after rendering keeps them from piling up
    across script reruns.
    """
    st.pyplot(fig)
    plt.close(fig)


# Introduction to EDA
st.markdown("""
# Exploratory Data Analysis (EDA)
Exploratory Data Analysis (EDA) is an essential step in the data analysis process. It involves:
- **Understanding the Structure**: By examining the dataset’s statistics and structure, we can identify patterns, trends, and potential issues.
- **Visualizing Distributions**: Histograms and boxplots give insight into the distribution of data, the spread of numerical values, and the presence of any outliers.
- **Finding Relationships**: Through scatter plots and correlation matrices, we can identify relationships between two or more variables, which helps in building predictive models.

EDA helps in:
- Cleaning the dataset by handling missing values, detecting outliers, and fixing errors.
- Gaining insights that can inform further analysis or modeling steps.
""")

# File uploader for dataset
uploaded_file = st.file_uploader("Upload your dataset (CSV format):", type=["csv"])

if uploaded_file is not None:
    # Read and display the dataset
    data = pd.read_csv(uploaded_file)
    st.write("### Uploaded Dataset:")
    st.dataframe(data)

    # Dataset Overview
    st.write("### Dataset Overview:")
    st.write(data.describe())

    # Missing values in the dataset
    st.write("### Missing Values:")
    st.write(data.isnull().sum())

    # A plain list keeps truthiness checks, slicing and widget defaults simple.
    numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns.tolist()

    # Correlation matrix for numerical columns
    st.write("### Correlation Matrix:")
    if len(numeric_columns) > 1:
        correlation_matrix = data[numeric_columns].corr()
        st.write(correlation_matrix)

        st.write("Heatmap of Correlation Matrix:")
        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
        _render_figure(fig)

    if numeric_columns:
        # Univariate Plots (For a single column)
        st.write("### Univariate Analysis: Distribution of Columns")
        selected_numeric_column = st.selectbox("Select a Numeric Column for Univariate Analysis", numeric_columns)

        # Histogram for univariate distribution
        st.write(f"Histogram for '{selected_numeric_column}':")
        fig, ax = plt.subplots()
        sns.histplot(data[selected_numeric_column], kde=True, ax=ax)
        _render_figure(fig)

        # Display automatic insights for univariate analysis
        univariate_insights = generate_univariate_insights(data, selected_numeric_column)
        st.write("### Insights:")
        st.write(univariate_insights)

        # Boxplot for univariate distribution
        st.write(f"Boxplot for '{selected_numeric_column}':")
        fig, ax = plt.subplots()
        sns.boxplot(x=data[selected_numeric_column], ax=ax)
        _render_figure(fig)

        # Bivariate Plots (For two columns)
        st.write("### Bivariate Analysis: Relationships between Two Variables")
        selected_bivariate_columns = st.multiselect(
            "Select Two Columns for Bivariate Analysis",
            options=numeric_columns,
            default=numeric_columns[:2]
        )

        if len(selected_bivariate_columns) == 2:
            first_column, second_column = selected_bivariate_columns
            st.write(f"Scatter Plot between '{first_column}' and '{second_column}':")
            fig, ax = plt.subplots()
            sns.scatterplot(x=data[first_column], y=data[second_column], ax=ax)
            _render_figure(fig)

            # Display automatic insights for bivariate analysis
            bivariate_insights = generate_bivariate_insights(data, first_column, second_column)
            st.write("### Insights:")
            st.write(bivariate_insights)

        # Multivariate Plots (For multiple columns)
        st.write("### Multivariate Analysis: Relationships between Multiple Variables")
        selected_multivariate_columns = st.multiselect(
            "Select Columns for Multivariate Analysis",
            options=numeric_columns,
            default=numeric_columns[:3]
        )

        if len(selected_multivariate_columns) > 1:
            st.write(f"Pairplot for selected variables: {', '.join(selected_multivariate_columns)}")
            # pairplot builds its own figure-level grid, so render that
            # figure; a separately created subplots figure would stay blank.
            pair_grid = sns.pairplot(data[selected_multivariate_columns])
            _render_figure(pair_grid.figure)

            # Display automatic insights for multivariate analysis
            multivariate_insights = generate_multivariate_insights(data, selected_multivariate_columns)
            st.write("### Insights:")
            st.write(multivariate_insights)
    else:
        # Without numeric columns the select boxes would return None and the
        # plots below them would fail on data[None].
        st.info("No numeric columns available for univariate, bivariate, or multivariate analysis.")

    # Categorical vs Numeric (boxplots); needs at least one column of each kind
    categorical_columns = data.select_dtypes(include=['object', 'category']).columns
    if len(categorical_columns) > 0 and numeric_columns:
        selected_cat_column = st.selectbox("Select a Categorical Column for Analysis", categorical_columns)

        st.write(f"Boxplot for '{selected_cat_column}' vs Numeric Column:")
        selected_numeric_column_for_cat = st.selectbox("Select a Numeric Column to Plot", numeric_columns)
        fig, ax = plt.subplots()
        sns.boxplot(x=data[selected_cat_column], y=data[selected_numeric_column_for_cat], ax=ax)
        _render_figure(fig)

        st.write("### Insights:")
        st.write(f"Boxplot shows the distribution of '{selected_numeric_column_for_cat}' values for each category in '{selected_cat_column}'. It helps identify if the numerical values differ across categories.")

    # Closing note (this page does not produce a cleaned dataset or a download)
    st.markdown("""
    This analysis provides a basic understanding of the dataset.
    You can now proceed with further analysis or modeling.
    """)
else:
    st.warning("Please upload a dataset to proceed with EDA.")