Spaces:

Mpavan45
/

Model

Sleeping

App Files Files Community

Model / pages /Simple EDA.py

Mpavan45

Update pages/Simple EDA.py

54cdf72 verified 12 months ago

raw

history blame contribute delete

3.49 kB

	import streamlit as st
	import pandas as pd
	import seaborn as sns
	import matplotlib.pyplot as plt

	# Page header and introductory blurb shown above the upload widget
	page_title = "Exploratory Data Analysis (EDA) App"
	intro_text = """
	By performing simple Exploratory Data Analysis (EDA), we can examine the data, identify patterns, and detect anomalies or inconsistencies. This process allows us to clean and preprocess the dataset effectively, ensuring it is well-structured and ready for further analysis or modeling. Simple EDA helps uncover hidden insights, address missing or erroneous values, and optimize the data for better decision-making.

	"""
	st.title(page_title)
	st.markdown(intro_text)

	# CSV upload widget; the script later checks this value against None
	uploaded_file = st.file_uploader(
	    label="Upload your dataset (CSV format):",
	    type=["csv"],
	)

	def _show_figure(fig) -> None:
	    """Render a Matplotlib figure in the app, then close it.

	    Figures created through ``pyplot`` stay registered with pyplot until
	    they are closed. Closing each one right after it is drawn keeps them
	    from piling up every time the script is executed again.
	    """
	    st.pyplot(fig)
	    plt.close(fig)


	if uploaded_file is not None:
	    # Read the upload. A bad file is reported in the page instead of
	    # surfacing as a raw traceback, and the rest of the script is skipped.
	    try:
	        data = pd.read_csv(uploaded_file)
	    except (
	        pd.errors.EmptyDataError,
	        pd.errors.ParserError,
	        UnicodeDecodeError,
	    ) as exc:
	        st.error(f"Could not read the uploaded file as CSV: {exc}")
	        st.stop()

	    st.write("### Uploaded Dataset:")
	    st.dataframe(data)

	    # Overview of the dataset
	    st.write("### Dataset Overview:")
	    st.write(data.describe())

	    # Missing values per column
	    st.write("### Missing Values:")
	    st.write(data.isnull().sum())

	    # Duplicate rows in the dataset
	    st.write("### Duplicate Rows:")
	    st.write(f"Number of duplicate rows: {data.duplicated().sum()}")

	    # "number" covers every numeric dtype (int32, float32, nullable ints, ...)
	    # rather than only float64/int64. Computed once and reused below.
	    numeric_columns = data.select_dtypes(include="number").columns

	    # Visualizations for numeric columns
	    st.write("### Numeric Column Visualizations:")
	    if len(numeric_columns) > 0:
	        st.write("Histograms:")
	        # Let pandas build its own grid of subplots. Handing it one shared
	        # Axes for several columns makes it clear that figure and warn.
	        hist_axes = data[numeric_columns].hist(figsize=(10, 8))
	        _show_figure(hist_axes.flat[0].figure)

	        st.write("Boxplot:")
	        fig, ax = plt.subplots()
	        sns.boxplot(data=data[numeric_columns], orient='h', ax=ax)
	        _show_figure(fig)
	    else:
	        # DataFrame.hist raises when there is nothing numeric to plot.
	        st.write("No numeric columns available for visualization.")

	    # Value counts and bar plot for categorical data
	    categorical_columns = data.select_dtypes(include=['object', 'category']).columns
	    if len(categorical_columns) > 0:
	        selected_cat_col = st.selectbox("Select a Categorical Column to Analyze", categorical_columns)
	        st.write(f"Value Counts for '{selected_cat_col}':")
	        st.write(data[selected_cat_col].value_counts())

	        st.write(f"Bar Plot for '{selected_cat_col}':")
	        fig, ax = plt.subplots()
	        sns.countplot(x=selected_cat_col, data=data, ax=ax)
	        _show_figure(fig)
	    else:
	        st.write("No categorical columns available for analysis.")

	    # Correlation matrix (needs at least two numeric columns)
	    if len(numeric_columns) > 1:
	        correlation = data[numeric_columns].corr()  # shared by table and heatmap
	        st.write("### Correlation Matrix:")
	        st.write(correlation)

	        st.write("Heatmap of Correlation Matrix:")
	        fig, ax = plt.subplots()
	        sns.heatmap(correlation, annot=True, cmap='coolwarm', ax=ax)
	        _show_figure(fig)

	    # Clean the data: only duplicate rows are removed here; missing values
	    # are left exactly as they were uploaded.
	    st.write("### Cleaned Dataset:")
	    cleaned_data = data.drop_duplicates()
	    st.dataframe(cleaned_data)

	    # Download button for the cleaned dataset
	    cleaned_csv = cleaned_data.to_csv(index=False).encode('utf-8')
	    st.download_button(
	        label="Download Cleaned Dataset",
	        data=cleaned_csv,
	        file_name="cleaned_dataset.csv",
	        mime="text/csv",
	    )

	    st.markdown("""
	This analysis provides a basic understanding of the dataset.
	You can now download the cleaned dataset and proceed with further analysis or modeling.
	""")
	else:
	    st.warning("Please upload a dataset to proceed with Simple EDA.")