"""DataLens: a Streamlit page for interactive exploratory data analysis.

The page shows a static introduction, lets the user upload a CSV or Excel
file, and then renders whichever analyses are picked in the sidebar.
"""

import pandas as pd
import plotly.express as px
import streamlit as st

import functions

# dtype kinds accepted as a numeric target: bool, int, uint, float, complex.
NUMERIC_KINDS = 'biufc'


def _target_correlation(column, target):
    """Return the Pearson correlation between *column* and a numeric *target*.

    Numeric columns are correlated directly.  Any other column is one-hot
    encoded and the per-level correlation with the largest absolute value is
    returned, so a single figure can be shown per column.  Returns NaN when no
    level yields a defined correlation (e.g. a constant column).
    """
    if column.dtype.kind in NUMERIC_KINDS:
        return column.corr(target)

    dummies = pd.get_dummies(column, prefix=column.name).astype(float)
    per_level = dummies.corrwith(target.astype(float)).dropna()
    if per_level.empty:
        return float('nan')
    return per_level.loc[per_level.abs().idxmax()]


# set_page_config must be the first Streamlit call on the page.
st.set_page_config(layout="wide", page_icon='logo.png', page_title='EDA')

# --- Header section with title and description ------------------------------
st.markdown('Welcome to DataLens: Visualize and Uncover Insights!',
            unsafe_allow_html=True)
st.markdown(
    'DataLens is an interactive Exploratory Data Analysis (EDA) tool designed '
    'to empower data scientists and analysts in uncovering valuable insights '
    'from their datasets. With DataLens, you can dive deep into your data, '
    'visualize patterns, and gain a comprehensive understanding of its '
    'characteristics.',
    unsafe_allow_html=True,
)
st.markdown('Unlock the potential of your data with a range of powerful features:',
            unsafe_allow_html=True)

# --- Introduction section with feature checklist ----------------------------
# NOTE(review): the 'β' below looks like a mis-decoded emoji; confirm the
# intended character against the original file encoding.
st.markdown('You will be able toβ :', unsafe_allow_html=True)
FEATURE_LINES = (
    '1. See the whole dataset',
    '2. Get column names, data types info',
    '3. Get the count and percentage of NA values',
    '4. Get descriptive analysis',
    '5. Check imbalance or distribution of target variable:',
    '6. See distribution of numerical columns',
    '7. See count plot of categorical columns',
    '8. Get outlier analysis with box plots',
    '9. Obtain info of target value variance with categorical columns',
)
for feature_line in FEATURE_LINES:
    st.markdown(feature_line, unsafe_allow_html=True)

# --- Testimonials or user feedback ------------------------------------------
st.subheader("What Users Are Saying")
st.markdown(
    '"The EDA tool helped me uncover valuable insights in my data. '
    'Highly recommended!"',
    unsafe_allow_html=True,
)
st.markdown('- John Doe, Data Scientist', unsafe_allow_html=True)

functions.space()

# --- Dataset import ---------------------------------------------------------
st.write('Import Dataset', unsafe_allow_html=True)
file_format = st.radio('Select file format:', ('csv', 'excel'), key='file_format')
dataset = st.file_uploader(label='Upload dataset file')

# NOTE(review): the trailing 'π' characters in the sidebar strings look like
# mis-decoded emoji; confirm the intended characters.
st.sidebar.header('Import Dataset to Use Available Features: π')

if dataset:
    # A file that does not match the chosen format should produce a readable
    # message rather than a traceback.  pandas parser errors and decoding
    # errors are all ValueError subclasses.
    try:
        if file_format == 'csv':
            df = pd.read_csv(dataset)
        else:
            df = pd.read_excel(dataset)
    except ValueError as exc:
        st.error(f'Could not read the uploaded file as {file_format}: {exc}')
        st.stop()

    st.subheader('Dataframe:')
    st.write('First 4 records of the dataset:')
    st.write(df.head(4))

    st.subheader('Dataframe:')
    n, m = df.shape
    st.write(f'Dataset contains {n} rows and {m} columns.', unsafe_allow_html=True)
    st.dataframe(df)

    all_vizuals = [
        'Info',
        'NA Info',
        'Descriptive Analysis',
        'Target Analysis',
        'Distribution of Numerical Columns',
        'Count Plots of Categorical Columns',
        'Box Plots',
        'Outlier Analysis',
        'Variance of Target with Categorical Columns',
        'Skewness and Kurtosis',
        # Previously missing, which made the section at the bottom unreachable.
        'Target Correlation',
    ]
    functions.sidebar_space(3)
    vizuals = st.sidebar.multiselect(
        "Choose which visualizations you want to see π", all_vizuals
    )

    # Column groups shared by several sections.  Numeric means number or bool
    # (the same kinds accepted for the target); datetime and category columns
    # are left out because skew/kurtosis and outlier analysis cannot use them.
    num_columns = df.select_dtypes(include=['number', 'bool']).columns
    cat_columns = df.select_dtypes(include='object').columns

    if 'Info' in vizuals:
        st.subheader('Info:')
        st.dataframe(functions.df_info(df))
        functions.space(2)

    if 'NA Info' in vizuals:
        st.subheader('NA Value Information:')
        if df.isnull().sum().sum() == 0:
            st.write('There are no missing values in your dataset.')
        else:
            st.dataframe(functions.df_isnull(df), width=1500)
        functions.space(2)

    if 'Descriptive Analysis' in vizuals:
        st.subheader('Descriptive Analysis:')
        st.dataframe(df.describe())
        functions.space(2)

    if 'Target Analysis' in vizuals:
        if len(df.columns) == 0:
            st.error("No columns found in the dataset.")
        else:
            # Defaults to the last column, a common position for the target.
            target_column = st.selectbox(
                "Select target column:", df.columns, index=len(df.columns) - 1
            )
            # Compare against None: a column labelled 0 or '' is falsy but valid.
            if target_column is None:
                st.warning("No target column selected.")
            elif df[target_column].dtype.kind in NUMERIC_KINDS:
                st.subheader("Histogram of target column")
                fig = px.histogram(df, x=target_column)
                # Narrow side columns centre the chart on the wide layout.
                c1, c2, c3 = st.columns([0.5, 2, 0.5])
                c2.plotly_chart(fig)
            else:
                st.error("Selected target column is not numeric.")

    if 'Distribution of Numerical Columns' in vizuals:
        if len(num_columns) == 0:
            st.write('There are no numerical columns in the data.')
        else:
            selected_num_cols = st.sidebar.multiselect(
                'Choose columns for Distribution plots:', num_columns,
                key='Distribution',
            )
            st.subheader('Distribution of numerical columns')
            for col in selected_num_cols:
                fig = px.histogram(df, x=col)
                st.plotly_chart(fig, use_container_width=True)

    if 'Count Plots of Categorical Columns' in vizuals:
        if len(cat_columns) == 0:
            st.write('There are no categorical columns in the data.')
        else:
            selected_cat_cols = st.sidebar.multiselect(
                'Choose columns for Count plots:', cat_columns, key='Count'
            )
            st.subheader('Count plots of categorical columns')
            for col in selected_cat_cols:
                fig = px.histogram(df, x=col, color_discrete_sequence=['indianred'])
                st.plotly_chart(fig)

    if 'Box Plots' in vizuals:
        if len(num_columns) == 0:
            st.write('There are no numerical columns in the data.')
        else:
            selected_num_cols = st.sidebar.multiselect(
                'Choose columns for Box plots:', num_columns, key='Box'
            )
            st.subheader('Box plots')
            for col in selected_num_cols:
                fig = px.box(df, y=col)
                st.plotly_chart(fig, use_container_width=True)

    if 'Outlier Analysis' in vizuals:
        st.subheader('Outlier Analysis')
        if len(num_columns) == 0:
            st.write('There are no numerical columns in the data.')
        else:
            selected_num_cols = st.sidebar.multiselect(
                'Choose columns for Outlier Analysis:', num_columns, key='Outlier'
            )
            if len(selected_num_cols) == 0:
                st.write('No columns selected for Outlier Analysis.')
            else:
                for col in selected_num_cols:
                    st.subheader(f'Outliers in column: {col}')
                    outliers = functions.number_of_outliers(df[col])
                    st.write(outliers)
                functions.space(2)

    if 'Variance of Target with Categorical Columns' in vizuals:
        # Rows with any missing value are excluded from these plots.
        df_1 = df.dropna()

        # A categorical column counts as high-cardinality when it has more
        # distinct values than a tenth of the number of rows.
        cardinality_limit = df.shape[0] / 10
        high_cardi_columns = [
            col for col in cat_columns if df[col].nunique() > cardinality_limit
        ]
        normal_cardi_columns = [
            col for col in cat_columns if col not in high_cardi_columns
        ]

        if len(normal_cardi_columns) == 0:
            st.write('There are no categorical columns with normal cardinality in the data.')
        else:
            st.subheader('Variance of target variable with categorical columns')
            model_type = st.radio(
                'Select Problem Type:', ('Regression', 'Classification'),
                key='model_type',
            )
            # Reuse the Target Analysis selection when that section is shown;
            # rendering the same selectbox twice would clash.
            if 'Target Analysis' not in vizuals:
                target_column = st.selectbox(
                    "Select target column:", df.columns, index=len(df.columns) - 1
                )
            selected_cat_cols = st.sidebar.multiselect(
                'Choose columns for Category Colored plots:',
                normal_cardi_columns, key='Category',
            )
            for col in selected_cat_cols:
                if model_type == 'Regression':
                    fig = px.box(df_1, y=target_column, color=col)
                else:
                    fig = px.histogram(df_1, color=col, x=target_column)
                st.plotly_chart(fig, use_container_width=True)

            if high_cardi_columns:
                st.subheader('The following columns have high cardinality, which is why their boxplots were not plotted:')
                for col in high_cardi_columns:
                    st.write(col)
                st.markdown('Do you want to plot anyway?', unsafe_allow_html=True)
                answer = st.selectbox("", ('No', 'Yes'))
                if answer == 'Yes':
                    for col in high_cardi_columns:
                        fig = px.box(df_1, y=target_column, color=col)
                        st.plotly_chart(fig, use_container_width=True)

    if 'Skewness and Kurtosis' in vizuals:
        if len(num_columns) == 0:
            st.write('There are no numerical columns in the data.')
        else:
            st.subheader('Skewness and Kurtosis')
            for col in num_columns:
                skewness = df[col].skew()
                kurtosis = df[col].kurtosis()
                st.write(f'Skewness of {col}: {skewness}')
                st.write(f'Kurtosis of {col}: {kurtosis}')
            functions.space(2)

    if 'Target Correlation' in vizuals:
        if len(df.columns) == 0:
            st.error("No columns found in the dataset.")
        else:
            selected_columns = st.multiselect(
                "Select columns for target correlation:", df.columns
            )
            if len(selected_columns) == 0:
                st.warning("No columns selected for target correlation.")
            else:
                # Explicit key: this widget is otherwise identical to the
                # Target Analysis selectbox and would raise DuplicateWidgetID
                # when both sections are enabled.
                corr_target = st.selectbox(
                    "Select target column:", df.columns,
                    index=len(df.columns) - 1, key='target_correlation',
                )
                # Complex targets are rejected too: the correlation helper
                # casts the target to float.
                if df[corr_target].dtype.kind not in 'biuf':
                    st.error("Selected target column is not numeric.")
                else:
                    st.subheader(f'Correlation between selected columns and target variable: {corr_target}')
                    correlations = pd.DataFrame(
                        {
                            'Correlation': [
                                _target_correlation(df[col], df[corr_target])
                                for col in selected_columns
                            ]
                        },
                        index=selected_columns,
                    )
                    st.dataframe(correlations)

                    # Plotly needs the column names as a regular column.
                    plot_data = correlations.rename_axis('Column').reset_index()
                    fig = px.bar(
                        plot_data, x='Column', y='Correlation',
                        title=f'Correlation between selected columns and target variable: {corr_target}',
                    )
                    fig.update_layout(xaxis_tickangle=-45)
                    st.plotly_chart(fig, use_container_width=True)