trohith89 committed on
Commit
7bc16d5
·
verified ·
1 Parent(s): 12c2ae4

Update pages/Data_CLeaning_and_Preprocessing.py

Browse files
pages/Data_CLeaning_and_Preprocessing.py CHANGED
@@ -32,15 +32,23 @@ if uploaded_file is not None:
32
 
33
  # Visualizations for numeric columns
34
  st.write("### Numeric Column Visualizations:")
 
 
35
  st.write("Histograms:")
36
- fig, ax = plt.subplots()
37
- data.hist(ax=ax, figsize=(10, 8))
38
- st.pyplot(fig)
 
 
 
39
 
40
- st.write("Boxplot:")
41
- fig, ax = plt.subplots()
42
- sns.boxplot(data=data, orient='h', ax=ax)
43
- st.pyplot(fig)
 
 
 
44
 
45
  # Value counts and bar plot for categorical data
46
  categorical_columns = data.select_dtypes(include=['object', 'category']).columns
@@ -52,25 +60,25 @@ if uploaded_file is not None:
52
  st.write(f"Bar Plot for '{selected_cat_col}':")
53
  fig, ax = plt.subplots()
54
  sns.countplot(x=selected_cat_col, data=data, ax=ax)
 
55
  st.pyplot(fig)
56
  else:
57
  st.write("No categorical columns available for analysis.")
58
 
59
  # Correlation matrix
60
- numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
61
  if len(numeric_columns) > 1:
62
  st.write("### Correlation Matrix:")
63
- st.write(data[numeric_columns].corr())
 
64
 
65
  st.write("Heatmap of Correlation Matrix:")
66
- fig, ax = plt.subplots()
67
- sns.heatmap(data[numeric_columns].corr(), annot=True, cmap='coolwarm', ax=ax)
68
  st.pyplot(fig)
69
 
70
  # Clean the data: Handle missing values and duplicates
71
  st.write("### Cleaned Dataset:")
72
  cleaned_data = data.drop_duplicates() # Remove duplicate rows
73
- #cleaned_data = cleaned_data.fillna(cleaned_data.mean()) # Replace missing values with the mean for numeric columns
74
  st.dataframe(cleaned_data)
75
 
76
  # Download button for the cleaned dataset
@@ -87,4 +95,4 @@ if uploaded_file is not None:
87
  You can now download the cleaned dataset and proceed with further analysis or modeling.
88
  """)
89
  else:
90
- st.warning("Please upload a dataset to proceed with Simple EDA.")
 
32
 
33
  # Visualizations for numeric columns
34
  st.write("### Numeric Column Visualizations:")
35
+
36
+ # Histograms for numeric data
37
  st.write("Histograms:")
38
+ numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
39
+ for col in numeric_columns:
40
+ fig, ax = plt.subplots()
41
+ sns.histplot(data[col], kde=True, ax=ax)
42
+ ax.set_title(f'Histogram of {col}')
43
+ st.pyplot(fig)
44
 
45
+ # Boxplots for numeric data
46
+ st.write("Boxplots:")
47
+ for col in numeric_columns:
48
+ fig, ax = plt.subplots()
49
+ sns.boxplot(x=data[col], ax=ax)
50
+ ax.set_title(f'Boxplot of {col}')
51
+ st.pyplot(fig)
52
 
53
  # Value counts and bar plot for categorical data
54
  categorical_columns = data.select_dtypes(include=['object', 'category']).columns
 
60
  st.write(f"Bar Plot for '{selected_cat_col}':")
61
  fig, ax = plt.subplots()
62
  sns.countplot(x=selected_cat_col, data=data, ax=ax)
63
+ ax.set_title(f'Bar Plot of {selected_cat_col}')
64
  st.pyplot(fig)
65
  else:
66
  st.write("No categorical columns available for analysis.")
67
 
68
  # Correlation matrix
 
69
  if len(numeric_columns) > 1:
70
  st.write("### Correlation Matrix:")
71
+ corr_matrix = data[numeric_columns].corr()
72
+ st.write(corr_matrix)
73
 
74
  st.write("Heatmap of Correlation Matrix:")
75
+ fig, ax = plt.subplots(figsize=(10, 8))
76
+ sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
77
  st.pyplot(fig)
78
 
79
  # Clean the data: Handle missing values and duplicates
80
  st.write("### Cleaned Dataset:")
81
  cleaned_data = data.drop_duplicates() # Remove duplicate rows
 
82
  st.dataframe(cleaned_data)
83
 
84
  # Download button for the cleaned dataset
 
95
  You can now download the cleaned dataset and proceed with further analysis or modeling.
96
  """)
97
  else:
98
+ st.warning("Please upload a dataset to proceed with Simple EDA.")