Spaces:

Mpavan45
/

Model

Sleeping

App Files Files Community

Model / pages /EDA .py

Mpavan45

Rename pages/EDA and Feature Engineering.py to pages/EDA .py

7b7ee83 verified over 1 year ago

raw

history blame

7.25 kB

	import streamlit as st
	import pandas as pd
	import seaborn as sns
	import matplotlib.pyplot as plt

	# Function to generate automatic insights for univariate analysis
	def generate_univariate_insights(data, column):
	    """Summarise one numeric column as Markdown bullet points.

	    Args:
	        data: Object indexable by ``column`` that yields a pandas Series
	            (the caller in this file passes the uploaded DataFrame).
	        column: Label of the numeric column to describe.

	    Returns:
	        A Markdown string with one bullet per line covering the mean,
	        median, standard deviation, minimum and maximum. If the column has
	        no non-missing values, a single bullet saying so is returned
	        instead of a list of "nan" statistics.
	    """
	    series = data[column]

	    # Every statistic below is NaN for an empty or all-missing column, so
	    # report that situation explicitly rather than printing "nan" five times.
	    if series.dropna().empty:
	        return f"- The column '{column}' has no non-missing values to summarize."

	    mean_val = series.mean()
	    median_val = series.median()
	    min_val = series.min()
	    max_val = series.max()

	    # The sample standard deviation needs at least two observations;
	    # with a single value pandas returns NaN.
	    if series.count() < 2:
	        spread_line = (
	            "- The standard deviation is undefined because fewer than two "
	            "values are present."
	        )
	    else:
	        std_val = series.std()
	        spread_line = (
	            f"- The standard deviation is {std_val:.2f}, "
	            "suggesting the spread of the values."
	        )

	    # Build the text line by line so the result does not depend on how this
	    # source file happens to be indented.
	    lines = [
	        f"- The mean value of '{column}' is {mean_val:.2f}.",
	        f"- The median value is {median_val:.2f}, indicating the central tendency of the data.",
	        spread_line,
	        f"- The minimum value observed is {min_val}, and the maximum value is {max_val}.",
	    ]
	    return "\n".join(lines)

	# Function to generate automatic insights for bivariate analysis (scatter plot)
	def generate_bivariate_insights(data, col1, col2):
	    """Describe the linear correlation between two columns as Markdown.

	    Args:
	        data: Object indexable by column label that yields pandas Series
	            (the caller in this file passes the uploaded DataFrame).
	        col1: Label of the first numeric column.
	        col2: Label of the second numeric column.

	    Returns:
	        A Markdown string with one bullet per line: the correlation value
	        (or a note that it could not be computed) followed by two bullets
	        explaining how to read a correlation coefficient.
	    """
	    correlation = data[col1].corr(data[col2])

	    # The coefficient comes back as NaN when it is undefined, e.g. when one
	    # column is constant or there are too few paired values. Say so instead
	    # of showing "nan" to the user.
	    if pd.isna(correlation):
	        headline = (
	            f"- The correlation between '{col1}' and '{col2}' could not be computed "
	            "(too few paired values, or one of the columns does not vary)."
	        )
	    else:
	        headline = f"- The correlation between '{col1}' and '{col2}' is {correlation:.2f}."

	    lines = [
	        headline,
	        "- A correlation close to 1 indicates a strong positive relationship, while a correlation close to -1 indicates a strong negative relationship.",
	        "- A correlation near 0 suggests no linear relationship between the variables.",
	    ]
	    return "\n".join(lines)

	# Function to generate automatic insights for multivariate analysis (pairplot)
	def generate_multivariate_insights(data, columns, strong_threshold=0.7):
	    """Describe a pairplot of several numeric columns as Markdown.

	    The pairwise correlation matrix of the selected columns is used to name
	    the pairs that are actually strongly correlated, so the text reflects
	    the data rather than asserting that strong correlations exist.

	    Args:
	        data: DataFrame holding the selected columns.
	        columns: Sequence of numeric column labels shown in the pairplot.
	        strong_threshold: Minimum absolute correlation for a pair to be
	            reported as strongly correlated. Defaults to 0.7.

	    Returns:
	        A Markdown string with one bullet per line: the variables plotted,
	        what the diagonal shows, and either the strongly correlated pairs
	        or a note that none were found.
	    """
	    labels = list(columns)
	    correlations = data[labels].corr()

	    # Walk the upper triangle only, so each pair is reported once and a
	    # column is never compared with itself.
	    strong_pairs = []
	    for position, first in enumerate(labels):
	        for second in labels[position + 1:]:
	            value = correlations.loc[first, second]
	            # A NaN coefficient compares False here and is simply skipped.
	            if abs(value) >= strong_threshold:
	                strong_pairs.append(f"'{first}' and '{second}' ({value:.2f})")

	    if strong_pairs:
	        strength_line = (
	            f"- Strong correlations (|r| >= {strong_threshold:.2f}) are present between: "
	            + "; ".join(strong_pairs)
	            + "."
	        )
	    else:
	        strength_line = (
	            "- No pair of the selected variables has a strong linear correlation "
	            f"(|r| >= {strong_threshold:.2f})."
	        )

	    lines = [
	        # str() keeps the join working for non-string column labels.
	        "- The pairplot shows the relationships between the selected numeric variables: "
	        + ", ".join(str(label) for label in labels)
	        + ".",
	        "- The diagonal displays the distributions of each variable.",
	        strength_line,
	    ]
	    return "\n".join(lines)

	# Page introduction: a short Markdown explanation of what EDA is and why
	# it is useful, rendered at the top of the page before any upload.
	_EDA_INTRO_MARKDOWN = """
	# Exploratory Data Analysis (EDA)
	Exploratory Data Analysis (EDA) is an essential step in the data analysis process. It involves:
	- Understanding the Structure: By examining the dataset’s statistics and structure, we can identify patterns, trends, and potential issues.
	- Visualizing Distributions: Histograms and boxplots give insight into the distribution of data, the spread of numerical values, and the presence of any outliers.
	- Finding Relationships: Through scatter plots and correlation matrices, we can identify relationships between two or more variables, which helps in building predictive models.

	EDA helps in:
	- Cleaning the dataset by handling missing values, detecting outliers, and fixing errors.
	- Gaining insights that can inform further analysis or modeling steps.
	"""
	st.markdown(_EDA_INTRO_MARKDOWN)

	# Main page flow: upload a CSV, then show an overview, correlation heatmap,
	# univariate / bivariate / multivariate plots and the generated insights.


	def _show_figure(fig):
	    """Render a Matplotlib figure in the app and then close it.

	    Figures created through pyplot stay registered with pyplot until they
	    are closed, so closing right after rendering stops them accumulating.
	    """
	    st.pyplot(fig)
	    plt.close(fig)


	# File uploader for dataset
	uploaded_file = st.file_uploader("Upload your dataset (CSV format):", type=["csv"])

	if uploaded_file is not None:
	    # Read and display the dataset. A file that cannot be parsed is reported
	    # in the page instead of surfacing as an unhandled traceback.
	    try:
	        data = pd.read_csv(uploaded_file)
	    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
	        st.error(f"Could not read the uploaded file as CSV: {exc}")
	        st.stop()

	    st.write("### Uploaded Dataset:")
	    st.dataframe(data)

	    # Dataset Overview
	    st.write("### Dataset Overview:")
	    st.write(data.describe())

	    # Missing values in the dataset
	    st.write("### Missing Values:")
	    st.write(data.isnull().sum())

	    # Correlation matrix for numerical columns.
	    # "number" selects every numeric dtype rather than only int64/float64;
	    # a plain list is used so it can be truth-tested and sliced for defaults.
	    st.write("### Correlation Matrix:")
	    numeric_columns = data.select_dtypes(include="number").columns.tolist()
	    if len(numeric_columns) > 1:
	        # Compute once and reuse for both the table and the heatmap.
	        correlation_matrix = data[numeric_columns].corr()
	        st.write(correlation_matrix)

	        st.write("Heatmap of Correlation Matrix:")
	        fig, ax = plt.subplots(figsize=(10, 8))
	        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
	        _show_figure(fig)

	    if numeric_columns:
	        # Univariate Plots (For a single column)
	        st.write("### Univariate Analysis: Distribution of Columns")
	        selected_numeric_column = st.selectbox(
	            "Select a Numeric Column for Univariate Analysis", numeric_columns
	        )

	        # Histogram for univariate distribution
	        st.write(f"Histogram for '{selected_numeric_column}':")
	        fig, ax = plt.subplots()
	        sns.histplot(data[selected_numeric_column], kde=True, ax=ax)
	        _show_figure(fig)

	        # Display automatic insights for univariate analysis
	        univariate_insights = generate_univariate_insights(data, selected_numeric_column)
	        st.write("### Insights:")
	        st.write(univariate_insights)

	        # Boxplot for univariate distribution
	        st.write(f"Boxplot for '{selected_numeric_column}':")
	        fig, ax = plt.subplots()
	        sns.boxplot(x=data[selected_numeric_column], ax=ax)
	        _show_figure(fig)

	        # Bivariate Plots (For two columns)
	        st.write("### Bivariate Analysis: Relationships between Two Variables")
	        selected_bivariate_columns = st.multiselect(
	            "Select Two Columns for Bivariate Analysis",
	            options=numeric_columns,
	            default=numeric_columns[:2],
	        )

	        # The scatter plot is only drawn for exactly two selected columns.
	        if len(selected_bivariate_columns) == 2:
	            x_column, y_column = selected_bivariate_columns
	            st.write(f"Scatter Plot between '{x_column}' and '{y_column}':")
	            fig, ax = plt.subplots()
	            sns.scatterplot(x=data[x_column], y=data[y_column], ax=ax)
	            _show_figure(fig)

	            # Display automatic insights for bivariate analysis
	            bivariate_insights = generate_bivariate_insights(data, x_column, y_column)
	            st.write("### Insights:")
	            st.write(bivariate_insights)

	        # Multivariate Plots (For multiple columns)
	        st.write("### Multivariate Analysis: Relationships between Multiple Variables")
	        selected_multivariate_columns = st.multiselect(
	            "Select Columns for Multivariate Analysis",
	            options=numeric_columns,
	            default=numeric_columns[:3],
	        )

	        if len(selected_multivariate_columns) > 1:
	            st.write(f"Pairplot for selected variables: {', '.join(selected_multivariate_columns)}")
	            # pairplot builds its own figure and returns the grid that owns
	            # it; that figure is the one to render, not a separately created
	            # (and therefore empty) subplots figure.
	            pair_grid = sns.pairplot(data[selected_multivariate_columns])
	            _show_figure(pair_grid.figure)

	            # Display automatic insights for multivariate analysis
	            multivariate_insights = generate_multivariate_insights(data, selected_multivariate_columns)
	            st.write("### Insights:")
	            st.write(multivariate_insights)
	    else:
	        # Without numeric columns the select boxes would yield None and the
	        # plots below could not be drawn.
	        st.info("No numeric columns were found in the dataset, so the numeric analyses are skipped.")

	    # Categorical vs Numeric (boxplots); needs at least one column of each kind.
	    categorical_columns = data.select_dtypes(include=['object', 'category']).columns
	    if len(categorical_columns) > 0 and numeric_columns:
	        selected_cat_column = st.selectbox("Select a Categorical Column for Analysis", categorical_columns)

	        st.write(f"Boxplot for '{selected_cat_column}' vs Numeric Column:")
	        selected_numeric_column_for_cat = st.selectbox("Select a Numeric Column to Plot", numeric_columns)
	        fig, ax = plt.subplots()
	        sns.boxplot(x=data[selected_cat_column], y=data[selected_numeric_column_for_cat], ax=ax)
	        _show_figure(fig)

	        st.write("### Insights:")
	        st.write(f"Boxplot shows the distribution of '{selected_numeric_column_for_cat}' values for each category in '{selected_cat_column}'. It helps identify if the numerical values differ across categories.")

	    # Closing note shown after all analyses.
	    st.markdown("""
	    This analysis provides a basic understanding of the dataset.
	    You can now proceed with further analysis or modeling.
	    """)
	else:
	    st.warning("Please upload a dataset to proceed with EDA.")