Mpavan45 committed on
Commit
c4f9397
·
verified ·
1 Parent(s): 9c761b8

Update pages/Simple EDA.py

Browse files
Files changed (1) hide show
  1. pages/Simple EDA.py +89 -21
pages/Simple EDA.py CHANGED
@@ -1,33 +1,101 @@
"""Streamlit page (previous revision): minimal EDA of an uploaded CSV file.

Shows the raw table, ``describe()`` statistics, per-column missing-value
counts and a correlation matrix for the uploaded dataset.
"""

import streamlit as st
import pandas as pd

# EDA and Feature Engineering Page
st.title("Simple EDA")
st.markdown("""

By performing simple Exploratory Data Analysis (EDA), we can examine the data, identify patterns, and detect anomalies or inconsistencies. This process allows us to clean and preprocess the dataset effectively, ensuring it is well-structured and ready for further analysis or modeling..
""")

# File uploader for dataset
uploaded_file = st.file_uploader("Upload your dataset (CSV format):", type=["csv"])

if uploaded_file is not None:
    # Read and display the dataset
    data = pd.read_csv(uploaded_file)
    st.write("### Uploaded Dataset:")
    st.dataframe(data)

    # Overview of the dataset
    st.write("### Dataset Overview:")
    st.write(data.describe())

    # Missing values in the dataset
    st.write("### Missing Values:")
    st.write(data.isnull().sum())

    # Correlation matrix
    # Correlation is only defined for numeric data. Calling ``corr()`` on the
    # whole frame raises on text columns with recent pandas releases, so the
    # numeric columns are selected explicitly first.
    st.write("### Correlation Matrix:")
    st.write(data.select_dtypes(include="number").corr())
else:
    st.warning("Please upload a dataset to proceed with EDA.")
 
"""Streamlit page: upload a CSV file and run a basic exploratory data analysis.

The page renders, in order: a preview of the data, structural information,
summary statistics, missing-value and duplicate counts, histograms and a
boxplot of the numeric columns, a per-column categorical breakdown, a
correlation matrix with heatmap, and a download of the de-duplicated data.
"""

import io

import streamlit as st
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

_INTRO_MARKDOWN = """
This app allows you to perform basic EDA on your dataset.
Upload your dataset to explore, clean, and visualize your data interactively.
"""


def _show_figure(fig):
    """Render *fig* in the app, then close it.

    pyplot keeps every figure it creates alive until it is closed; since
    Streamlit reruns the script on each interaction, unclosed figures would
    pile up in memory.
    """
    st.pyplot(fig)
    plt.close(fig)


def _load_csv(uploaded_file):
    """Parse the uploaded file as CSV.

    Returns the resulting DataFrame, or ``None`` (after showing an error in
    the app) when the file is empty, malformed or not decodable as text.
    """
    try:
        return pd.read_csv(uploaded_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
        st.error(f"Could not read the uploaded file as CSV: {exc}")
        return None


def _render_overview(df):
    """Show the head, structure, summary statistics and dtypes of *df*."""
    st.subheader("Dataset Overview")
    st.write("First 5 Rows of the Dataset:")
    st.write(df.head())

    st.subheader("Basic Information about the Dataset")
    st.write("Structure of the Dataset:")
    # ``DataFrame.info`` writes to a file-like object; capture its output in
    # an in-memory text buffer so it can be displayed in the page.
    info_buffer = io.StringIO()
    df.info(buf=info_buffer)
    st.text(info_buffer.getvalue())

    st.write("Summary of Numeric Columns:")
    st.write(df.describe())

    st.write("Data Types of Each Column:")
    st.write(df.dtypes)


def _render_data_quality(df):
    """Show per-column missing-value counts and the number of duplicate rows."""
    st.subheader("Missing Values")
    st.write("Number of Missing Values per Column:")
    st.write(df.isnull().sum())

    st.subheader("Duplicate Rows")
    st.write(f"Number of Duplicate Rows: {df.duplicated().sum()}")


def _render_numeric_plots(df):
    """Plot histograms and a horizontal boxplot of the numeric columns."""
    st.subheader("Numeric Column Visualizations")
    numeric_df = df.select_dtypes(include="number")
    if numeric_df.empty:
        # Nothing to draw: either no numeric columns or no rows at all.
        st.write("No numeric data available to plot.")
        return

    st.write("Histograms:")
    # Let pandas build its own grid of subplots (one per column) and take the
    # figure from the returned axes, instead of handing it a single Axes that
    # it would have to clear and re-divide.
    axes = numeric_df.hist(figsize=(10, 8))
    _show_figure(axes.flat[0].get_figure())

    st.write("Boxplot:")
    fig, ax = plt.subplots()
    sns.boxplot(data=numeric_df, orient='h', ax=ax)
    _show_figure(fig)


def _render_categorical_analysis(df):
    """Show value counts and a count plot for a user-selected categorical column."""
    categorical_columns = df.select_dtypes(include=['object', 'category']).columns
    if len(categorical_columns) == 0:
        st.write("No categorical columns available in the dataset.")
        return

    st.subheader("Categorical Column Analysis")
    selected_cat_col = st.selectbox("Select a Categorical Column to Analyze", categorical_columns)

    st.write(f"Value Counts for {selected_cat_col}:")
    st.write(df[selected_cat_col].value_counts())

    st.write(f"Bar Plot for {selected_cat_col}:")
    fig, ax = plt.subplots()
    sns.countplot(x=selected_cat_col, data=df, ax=ax)
    _show_figure(fig)


def _render_correlation(df):
    """Show the correlation matrix of the numeric columns and its heatmap."""
    # "number" matches every numeric dtype, not only float64/int64.
    numeric_df = df.select_dtypes(include="number")
    if numeric_df.shape[1] <= 1:
        st.write("Not enough numeric columns for correlation analysis.")
        return

    st.subheader("Correlation Analysis")
    st.write("Correlation Matrix:")
    correlation_matrix = numeric_df.corr()
    st.write(correlation_matrix)

    st.write("Heatmap of Correlation Matrix:")
    fig, ax = plt.subplots()
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
    _show_figure(fig)


def _render_download(df):
    """Offer the de-duplicated dataset as a CSV download."""
    st.subheader("Save Cleaned Dataset")
    # The CSV is only built once the user asks for it, so ordinary reruns do
    # not pay for serialising the whole dataset.
    if st.button("Save Dataset (after removing duplicates)"):
        cleaned_df = df.drop_duplicates()
        cleaned_csv = cleaned_df.to_csv(index=False).encode('utf-8')
        st.download_button(
            label="Download Cleaned Dataset",
            data=cleaned_csv,
            file_name="cleaned_dataset.csv",
            mime="text/csv",
        )
        st.success("Cleaned dataset is ready for download!")


def main():
    """Render the page: header, file upload and, once a file is given, the EDA."""
    # Configure the Streamlit app
    st.title("Exploratory Data Analysis (EDA) App")
    st.markdown(_INTRO_MARKDOWN)

    # File upload
    uploaded_file = st.file_uploader("Upload your dataset (CSV format):", type="csv")
    if uploaded_file is None:
        st.info("Please upload a CSV file to get started.")
        return

    df = _load_csv(uploaded_file)
    if df is None:
        return

    _render_overview(df)
    _render_data_quality(df)
    _render_numeric_plots(df)
    _render_categorical_analysis(df)
    _render_correlation(df)
    _render_download(df)


main()