Mpavan45 committed on
Commit
29d1685
·
verified ·
1 Parent(s): 169d966

Update pages/Simple EDA.py

Browse files
Files changed (1) hide show
  1. pages/Simple EDA.py +61 -71
pages/Simple EDA.py CHANGED
@@ -6,96 +6,86 @@ import matplotlib.pyplot as plt
6
  # Configure the Streamlit app
7
  st.title("Exploratory Data Analysis (EDA) App")
8
  st.markdown("""
9
- This app allows you to perform basic EDA on your dataset.
10
- Upload your dataset to explore, clean, and visualize your data interactively.
11
  """)
12
 
13
- # File upload
14
- uploaded_file = st.file_uploader("Upload your dataset (CSV format):", type="csv")
15
-
16
- if uploaded_file:
17
- # Load dataset
18
- df = pd.read_csv(uploaded_file)
19
- st.subheader("Dataset Overview")
20
- st.write("First 5 Rows of the Dataset:")
21
- st.write(df.head())
22
-
23
- # Basic Information
24
- st.subheader("Basic Information about the Dataset")
25
- st.write("Structure of the Dataset:")
26
- buffer = []
27
- df.info(buf=buffer)
28
- st.text("".join(buffer))
29
-
30
- st.write("Summary of Numeric Columns:")
31
- st.write(df.describe())
32
-
33
- st.write("Data Types of Each Column:")
34
- st.write(df.dtypes)
35
-
36
- # Missing Values
37
- st.subheader("Missing Values")
38
- st.write("Number of Missing Values per Column:")
39
- st.write(df.isnull().sum())
40
-
41
- # Duplicate Rows
42
- st.subheader("Duplicate Rows")
43
- st.write(f"Number of Duplicate Rows: {df.duplicated().sum()}")
44
-
45
- # Visualize Numeric Columns
46
- st.subheader("Numeric Column Visualizations")
47
  st.write("Histograms:")
48
- fig, ax = plt.subplots(figsize=(10, 8))
49
- df.hist(ax=ax)
50
  st.pyplot(fig)
51
 
52
  st.write("Boxplot:")
53
  fig, ax = plt.subplots()
54
- sns.boxplot(data=df, orient='h', ax=ax)
55
  st.pyplot(fig)
56
 
57
- # Categorical Column Analysis
58
- categorical_columns = df.select_dtypes(include=['object', 'category']).columns
59
  if len(categorical_columns) > 0:
60
- st.subheader("Categorical Column Analysis")
61
  selected_cat_col = st.selectbox("Select a Categorical Column to Analyze", categorical_columns)
 
 
62
 
63
- st.write(f"Value Counts for {selected_cat_col}:")
64
- st.write(df[selected_cat_col].value_counts())
65
-
66
- st.write(f"Bar Plot for {selected_cat_col}:")
67
  fig, ax = plt.subplots()
68
- sns.countplot(x=selected_cat_col, data=df, ax=ax)
69
  st.pyplot(fig)
70
  else:
71
- st.write("No categorical columns available in the dataset.")
72
 
73
- # Correlation Matrix
74
- numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
75
  if len(numeric_columns) > 1:
76
- st.subheader("Correlation Analysis")
77
- st.write("Correlation Matrix:")
78
- correlation_matrix = df[numeric_columns].corr()
79
- st.write(correlation_matrix)
80
 
81
  st.write("Heatmap of Correlation Matrix:")
82
  fig, ax = plt.subplots()
83
- sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
84
  st.pyplot(fig)
85
- else:
86
- st.write("Not enough numeric columns for correlation analysis.")
87
-
88
- # Save Cleaned Data
89
- st.subheader("Save Cleaned Dataset")
90
- if st.button("Save Dataset (after removing duplicates)"):
91
- cleaned_df = df.drop_duplicates()
92
- cleaned_csv = cleaned_df.to_csv(index=False).encode('utf-8')
93
- st.download_button(
94
- label="Download Cleaned Dataset",
95
- data=cleaned_csv,
96
- file_name="cleaned_dataset.csv",
97
- mime="text/csv"
98
- )
99
- st.success("Cleaned dataset is ready for download!")
 
 
 
 
 
100
  else:
101
- st.info("Please upload a CSV file to get started.")
 
6
  # Configure the Streamlit app
7
  st.title("Exploratory Data Analysis (EDA) App")
8
  st.markdown("""
9
+ By performing simple Exploratory Data Analysis (EDA), we can examine the data, identify patterns, and detect anomalies or inconsistencies. This process allows us to clean and preprocess the dataset effectively, ensuring it is well-structured and ready for further analysis or modeling. Simple EDA helps uncover hidden insights, address missing or erroneous values, and optimize the data for better decision-making.
10
+
11
  """)
12
 
13
+ # File uploader for dataset
14
+ uploaded_file = st.file_uploader("Upload your dataset (CSV format):", type=["csv"])
15
+
16
+ if uploaded_file is not None:
17
+ # Read and display the dataset
18
+ data = pd.read_csv(uploaded_file)
19
+ st.write("### Uploaded Dataset:")
20
+ st.dataframe(data)
21
+
22
+ # Overview of the dataset
23
+ st.write("### Dataset Overview:")
24
+ st.write(data.describe())
25
+
26
+ # Missing values in the dataset
27
+ st.write("### Missing Values:")
28
+ st.write(data.isnull().sum())
29
+
30
+ # Duplicate rows in the dataset
31
+ st.write("### Duplicate Rows:")
32
+ st.write(f"Number of duplicate rows: {data.duplicated().sum()}")
33
+
34
+ # Visualizations for numeric columns
35
+ st.write("### Numeric Column Visualizations:")
 
 
 
 
 
 
 
 
 
 
 
36
  st.write("Histograms:")
37
+ fig, ax = plt.subplots()
38
+ data.hist(ax=ax, figsize=(10, 8))
39
  st.pyplot(fig)
40
 
41
  st.write("Boxplot:")
42
  fig, ax = plt.subplots()
43
+ sns.boxplot(data=data, orient='h', ax=ax)
44
  st.pyplot(fig)
45
 
46
+ # Value counts and bar plot for categorical data
47
+ categorical_columns = data.select_dtypes(include=['object', 'category']).columns
48
  if len(categorical_columns) > 0:
 
49
  selected_cat_col = st.selectbox("Select a Categorical Column to Analyze", categorical_columns)
50
+ st.write(f"Value Counts for '{selected_cat_col}':")
51
+ st.write(data[selected_cat_col].value_counts())
52
 
53
+ st.write(f"Bar Plot for '{selected_cat_col}':")
 
 
 
54
  fig, ax = plt.subplots()
55
+ sns.countplot(x=selected_cat_col, data=data, ax=ax)
56
  st.pyplot(fig)
57
  else:
58
+ st.write("No categorical columns available for analysis.")
59
 
60
+ # Correlation matrix
61
+ numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
62
  if len(numeric_columns) > 1:
63
+ st.write("### Correlation Matrix:")
64
+ st.write(data[numeric_columns].corr())
 
 
65
 
66
  st.write("Heatmap of Correlation Matrix:")
67
  fig, ax = plt.subplots()
68
+ sns.heatmap(data[numeric_columns].corr(), annot=True, cmap='coolwarm', ax=ax)
69
  st.pyplot(fig)
70
+
71
+ # Clean the data: Handle missing values and duplicates
72
+ st.write("### Cleaned Dataset:")
73
+ cleaned_data = data.drop_duplicates() # Remove duplicate rows
74
+ cleaned_data = cleaned_data.fillna(cleaned_data.mean()) # Replace missing values with the mean for numeric columns
75
+ st.dataframe(cleaned_data)
76
+
77
+ # Download button for the cleaned dataset
78
+ cleaned_csv = cleaned_data.to_csv(index=False).encode('utf-8')
79
+ st.download_button(
80
+ label="Download Cleaned Dataset",
81
+ data=cleaned_csv,
82
+ file_name="cleaned_dataset.csv",
83
+ mime="text/csv"
84
+ )
85
+
86
+ st.markdown("""
87
+ This analysis provides a basic understanding of the dataset.
88
+ You can now download the cleaned dataset and proceed with further analysis or modeling.
89
+ """)
90
  else:
91
+ st.warning("Please upload a dataset to proceed with EDA.")