trohith89 committed on
Commit
81da568
·
verified ·
1 Parent(s): 7f0a1d6

Update pages/2_Data_CLeaning_and_Preprocessing.py

Browse files
pages/2_Data_CLeaning_and_Preprocessing.py CHANGED
@@ -1,17 +1,40 @@
1
  import streamlit as st
2
  import pandas as pd
3
- import plotly.graph_objects as go
4
- import plotly.express as px
5
- from plotly.subplots import make_subplots
6
  from io import StringIO
 
7
 
8
  # Page Title
9
- st.markdown("<h1 style='text-align:center; color:white;'>Data Cleaning and Preprocessing</h1>", unsafe_allow_html=True)
10
 
11
- # Define the URL of the background image (use your own image URL)
12
- background_image_url = "https://cdn-uploads.huggingface.co/production/uploads/67441c51a784a9d15cb12871/clljdAv7f_LGL8dH5vCZQ.jpeg"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
 
 
14
  # Apply custom CSS for the background image and overlay
 
 
15
  st.markdown(
16
  f"""
17
  <style>
@@ -33,84 +56,12 @@ st.markdown(
33
  background: rgba(0, 0, 0, 0.4); /* Adjust transparency here (0.4 for 40% transparency) */
34
  z-index: -1;
35
  }}
 
 
 
 
 
36
  </style>
37
  """,
38
  unsafe_allow_html=True
39
- )
40
-
41
- # Check if the dataset is already in session state
42
- data = st.session_state.get("df")
43
-
44
- if data is not None:
45
- st.subheader("Dataset Preview:")
46
- st.write(data.head())
47
-
48
- st.subheader("Dataset Overview:")
49
- st.write(data.describe())
50
-
51
- st.subheader("Missing Values:")
52
- st.write(data.isnull().sum())
53
-
54
- st.subheader("Duplicate Rows:")
55
- st.write(f"Number of duplicate rows: {data.duplicated().sum()}")
56
-
57
- # Visualize Numeric Data
58
- numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
59
- if len(numeric_columns) > 0:
60
- st.subheader("Histograms for Numeric Columns:")
61
- fig = make_subplots(rows=len(numeric_columns), cols=1, subplot_titles=numeric_columns)
62
- for i, col in enumerate(numeric_columns):
63
- hist = px.histogram(data, x=col, nbins=30, title=f'Histogram of {col}')
64
- fig.add_trace(hist.data[0], row=i + 1, col=1)
65
-
66
- fig.update_layout(height=500 * len(numeric_columns), title_text="Histograms for Numeric Columns")
67
- st.plotly_chart(fig)
68
-
69
- st.subheader("Boxplots for Numeric Columns:")
70
- fig = make_subplots(rows=len(numeric_columns), cols=1, subplot_titles=numeric_columns)
71
- for i, col in enumerate(numeric_columns):
72
- boxplot = px.box(data, y=col, title=f'Boxplot of {col}')
73
- fig.add_trace(boxplot.data[0], row=i + 1, col=1)
74
-
75
- fig.update_layout(height=500 * len(numeric_columns), title_text="Boxplots for Numeric Columns")
76
- st.plotly_chart(fig)
77
- else:
78
- st.warning("No numeric columns available for visualization.")
79
-
80
- # Visualize Categorical Data
81
- categorical_columns = data.select_dtypes(include=['object', 'category']).columns
82
- if len(categorical_columns) > 0:
83
- st.subheader("Bar Plots for Categorical Columns:")
84
- selected_cat_col = st.selectbox("Select a Categorical Column", categorical_columns)
85
-
86
- st.write(f"Value Counts for '{selected_cat_col}':")
87
- st.write(data[selected_cat_col].value_counts())
88
-
89
- fig = px.bar(data, x=selected_cat_col, title=f'Bar Plot of {selected_cat_col}', color=selected_cat_col)
90
- st.plotly_chart(fig)
91
- else:
92
- st.warning("No categorical columns available for visualization.")
93
-
94
- # Correlation Matrix for Numeric Columns
95
- if len(numeric_columns) > 1:
96
- st.subheader("Correlation Matrix:")
97
- corr_matrix = data[numeric_columns].corr()
98
- fig = px.imshow(corr_matrix, title="Correlation Matrix", color_continuous_scale='coolwarm')
99
- st.plotly_chart(fig)
100
-
101
- st.subheader("Cleaned Dataset:")
102
- cleaned_data = data.drop_duplicates()
103
- st.write(cleaned_data)
104
-
105
- cleaned_csv = cleaned_data.to_csv(index=False).encode('utf-8')
106
- st.download_button(
107
- label="Download Cleaned Dataset",
108
- data=cleaned_csv,
109
- file_name="cleaned_dataset.csv",
110
- mime="text/csv"
111
- )
112
-
113
- st.session_state['df'] = cleaned_data
114
-
115
- else:
116
- st.warning("No dataset found. Please upload a dataset on the Home page.")
 
1
  import streamlit as st
2
  import pandas as pd
3
+ import os
 
 
4
  from io import StringIO
5
+ import sys
6
 
7
  # Page Title
8
+ st.markdown("<h1 style='text-align:center; color:#008080;'>Data Cleaning and Processing</h1>", unsafe_allow_html=True)
9
 
10
+ # Access dataset from session state
11
+ df = st.session_state.get("dataset")
12
+
13
+ if df is not None:
14
+ st.subheader("Dataset Preview:")
15
+ st.write(df.head())
16
+
17
+ st.subheader("Info of the Dataset:")
18
+ # Redirect the output of df.info() to a string buffer
19
+ buffer = StringIO()
20
+ df.info(buf=buffer)
21
+
22
+ # Display the content in Streamlit
23
+ st.write(buffer.getvalue())
24
+
25
+ st.subheader("Dataset Description:")
26
+ st.write(df.describe())
27
+
28
+ st.subheader("Shape of the Dataset:")
29
+ st.write(df.shape)
30
+ else:
31
+ st.warning("No dataset found. Please upload a dataset on the Home page.")
32
 
33
+
34
+ # Define the URL of the background image (use your own image URL)
35
  # Apply custom CSS for the background image and overlay
36
+ background_image_url = "https://cdn-uploads.huggingface.co/production/uploads/675fab3a2d0851e23d23cad3/FlisTFfpU7flDCWj_KttH.jpeg"
37
+
38
  st.markdown(
39
  f"""
40
  <style>
 
56
  background: rgba(0, 0, 0, 0.4); /* Adjust transparency here (0.4 for 40% transparency) */
57
  z-index: -1;
58
  }}
59
+ /* Styling the content to ensure text visibility */
60
+ .stMarkdown {{
61
+ color: black; /* White text to ensure visibility */
62
+ font-size: 30px; /* Adjust font size for better readability */
63
+ }}
64
  </style>
65
  """,
66
  unsafe_allow_html=True
67
+ )