Spaces:

Mpavan45
/

Model

Sleeping

App Files Files Community

Mpavan45 committed on Jan 6, 2025

Commit

7b7ee83

verified ·

1 Parent(s): bc48345

Rename pages/EDA and Feature Engineering.py to pages/EDA .py

Browse files

Files changed (2) hide show

pages/EDA .py +163 -0
pages/EDA and Feature Engineering.py +0 -40

pages/EDA .py ADDED Viewed

	@@ -0,0 +1,163 @@

+import streamlit as st
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+# Function to generate automatic insights for univariate analysis
+def generate_univariate_insights(data, column):
+    """Describe one column's summary statistics as Markdown bullets.
+
+    Args:
+        data: Object indexed as ``data[column]``; the selected column must
+            provide ``mean()``, ``median()``, ``std()``, ``min()`` and
+            ``max()``.
+        column: Label of the column to summarise.
+
+    Returns:
+        A multi-line string with four bullets: the mean, median and
+        standard deviation (two decimals each) and the raw min/max values.
+    """
+    series = data[column]
+    average = series.mean()
+    middle = series.median()
+    spread = series.std()
+    lowest = series.min()
+    highest = series.max()
+    # The empty first entry and the trailing four spaces reproduce the shape
+    # of an indented triple-quoted block, so the text stays byte-identical.
+    bullet_lines = [
+        "",
+        f"    - The mean value of '{column}' is {average:.2f}.",
+        f"    - The median value is {middle:.2f}, indicating the central tendency of the data.",
+        f"    - The standard deviation is {spread:.2f}, suggesting the spread of the values.",
+        f"    - The minimum value observed is {lowest}, and the maximum value is {highest}.",
+        "    ",
+    ]
+    return "\n".join(bullet_lines)
+# Function to generate automatic insights for bivariate analysis (scatter plot)
+def generate_bivariate_insights(data, col1, col2):
+    """Report the correlation between two columns as Markdown bullets.
+
+    Args:
+        data: Object indexed as ``data[col1]`` / ``data[col2]``; the first
+            column must provide ``corr()`` accepting the second.
+        col1: Label of the first column.
+        col2: Label of the second column.
+
+    Returns:
+        A multi-line string stating the coefficient (two decimals) followed
+        by two fixed bullets on how to read it.
+    """
+    first_series = data[col1]
+    second_series = data[col2]
+    coefficient = first_series.corr(second_series)
+    # The empty first entry and the trailing four spaces reproduce the shape
+    # of an indented triple-quoted block, so the text stays byte-identical.
+    bullet_lines = [
+        "",
+        f"    - The correlation between '{col1}' and '{col2}' is {coefficient:.2f}.",
+        "    - A correlation close to 1 indicates a strong positive relationship, while a correlation close to -1 indicates a strong negative relationship.",
+        "    - A correlation near 0 suggests no linear relationship between the variables.",
+        "    ",
+    ]
+    return "\n".join(bullet_lines)
+# Function to generate automatic insights for multivariate analysis (pairplot)
+def generate_multivariate_insights(data, columns):
+    """Return fixed Markdown guidance for reading a pairplot of ``columns``.
+
+    Args:
+        data: Kept for interface compatibility; not read. The correlation
+            matrix that used to be computed here was never used in the
+            returned text, so it is no longer calculated.
+        columns: Iterable of column labels shown in the pairplot.
+
+    Returns:
+        A multi-line string naming the selected columns followed by two
+        fixed bullets about the plot.
+    """
+    # Labels are converted with str() so non-string labels (e.g. integer
+    # headers) do not make str.join raise TypeError.
+    column_list = ', '.join(map(str, columns))
+    insights = f"""
+    - The pairplot shows the relationships between the selected numeric variables: {column_list}.
+    - The diagonal displays the distributions of each variable.
+    - Strong correlations (positive or negative) can be seen in the scatter plots between some variables.
+    """
+    return insights
+# Introduction to EDA
+st.markdown("""
+# Exploratory Data Analysis (EDA)
+Exploratory Data Analysis (EDA) is an essential step in the data analysis process. It involves:
+- **Understanding the Structure**: By examining the dataset’s statistics and structure, we can identify patterns, trends, and potential issues.
+- **Visualizing Distributions**: Histograms and boxplots give insight into the distribution of data, the spread of numerical values, and the presence of any outliers.
+- **Finding Relationships**: Through scatter plots and correlation matrices, we can identify relationships between two or more variables, which helps in building predictive models.
+EDA helps in:
+- Cleaning the dataset by handling missing values, detecting outliers, and fixing errors.
+- Gaining insights that can inform further analysis or modeling steps.
+""")
+def _show_figure(figure):
+    """Render a Matplotlib figure on the page, then close it.
+
+    Closing after rendering keeps figures from accumulating in pyplot's
+    global registry each time the script runs.
+    """
+    st.pyplot(figure)
+    plt.close(figure)
+# File uploader for dataset
+uploaded_file = st.file_uploader("Upload your dataset (CSV format):", type=["csv"])
+if uploaded_file is not None:
+    # Uploaded content is untrusted: report an unreadable file on the page
+    # instead of failing with a traceback.
+    try:
+        data = pd.read_csv(uploaded_file)
+    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
+        st.error(f"Could not read the uploaded file as CSV: {exc}")
+        st.stop()
+    st.write("### Uploaded Dataset:")
+    st.dataframe(data)
+    # Dataset Overview
+    st.write("### Dataset Overview:")
+    st.write(data.describe())
+    # Missing values in the dataset
+    st.write("### Missing Values:")
+    st.write(data.isnull().sum())
+    # Correlation matrix for numerical columns
+    st.write("### Correlation Matrix:")
+    numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
+    if len(numeric_columns) > 1:
+        # Computed once and shared by the table and the heatmap.
+        correlation_matrix = data[numeric_columns].corr()
+        st.write(correlation_matrix)
+        st.write("Heatmap of Correlation Matrix:")
+        fig, ax = plt.subplots(figsize=(10, 8))
+        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
+        _show_figure(fig)
+    if len(numeric_columns) > 0:
+        # Univariate Plots (For a single column)
+        st.write("### Univariate Analysis: Distribution of Columns")
+        selected_numeric_column = st.selectbox("Select a Numeric Column for Univariate Analysis", numeric_columns)
+        # Histogram for univariate distribution
+        st.write(f"Histogram for '{selected_numeric_column}':")
+        fig, ax = plt.subplots()
+        sns.histplot(data[selected_numeric_column], kde=True, ax=ax)
+        _show_figure(fig)
+        # Display automatic insights for univariate analysis
+        univariate_insights = generate_univariate_insights(data, selected_numeric_column)
+        st.write("### Insights:")
+        st.write(univariate_insights)
+        # Boxplot for univariate distribution
+        st.write(f"Boxplot for '{selected_numeric_column}':")
+        fig, ax = plt.subplots()
+        sns.boxplot(x=data[selected_numeric_column], ax=ax)
+        _show_figure(fig)
+        # Bivariate Plots (For two columns)
+        st.write("### Bivariate Analysis: Relationships between Two Variables")
+        selected_bivariate_columns = st.multiselect(
+            "Select Two Columns for Bivariate Analysis",
+            options=numeric_columns,
+            default=numeric_columns[:2]
+        )
+        if len(selected_bivariate_columns) == 2:
+            x_column, y_column = selected_bivariate_columns
+            st.write(f"Scatter Plot between '{x_column}' and '{y_column}':")
+            fig, ax = plt.subplots()
+            sns.scatterplot(x=data[x_column], y=data[y_column], ax=ax)
+            _show_figure(fig)
+            # Display automatic insights for bivariate analysis
+            bivariate_insights = generate_bivariate_insights(data, x_column, y_column)
+            st.write("### Insights:")
+            st.write(bivariate_insights)
+        # Multivariate Plots (For multiple columns)
+        st.write("### Multivariate Analysis: Relationships between Multiple Variables")
+        selected_multivariate_columns = st.multiselect(
+            "Select Columns for Multivariate Analysis",
+            options=numeric_columns,
+            default=numeric_columns[:3]
+        )
+        if len(selected_multivariate_columns) > 1:
+            st.write(f"Pairplot for selected variables: {', '.join(selected_multivariate_columns)}")
+            # pairplot draws on a figure it creates itself, so that figure
+            # (not a separately created blank one) is what must be rendered.
+            pair_grid = sns.pairplot(data[selected_multivariate_columns])
+            _show_figure(pair_grid.figure)
+            # Display automatic insights for multivariate analysis
+            multivariate_insights = generate_multivariate_insights(data, selected_multivariate_columns)
+            st.write("### Insights:")
+            st.write(multivariate_insights)
+    else:
+        # With no options the select boxes return None, and indexing the
+        # frame with None would raise KeyError.
+        st.info("No numeric columns were found, so the numeric analyses are skipped.")
+    # Categorical vs Numeric (boxplots)
+    categorical_columns = data.select_dtypes(include=['object', 'category']).columns
+    if len(categorical_columns) > 0 and len(numeric_columns) > 0:
+        selected_cat_column = st.selectbox("Select a Categorical Column for Analysis", categorical_columns)
+        st.write(f"Boxplot for '{selected_cat_column}' vs Numeric Column:")
+        selected_numeric_column_for_cat = st.selectbox("Select a Numeric Column to Plot", numeric_columns)
+        fig, ax = plt.subplots()
+        sns.boxplot(x=data[selected_cat_column], y=data[selected_numeric_column_for_cat], ax=ax)
+        _show_figure(fig)
+        st.write("### Insights:")
+        st.write(f"Boxplot shows the distribution of '{selected_numeric_column_for_cat}' values for each category in '{selected_cat_column}'. It helps identify if the numerical values differ across categories.")
+    # Closing note shown after all analyses
+    st.markdown("""
+    This analysis provides a basic understanding of the dataset.
+    You can now proceed with further analysis or modeling.
+    """)
+else:
+    st.warning("Please upload a dataset to proceed with EDA.")

pages/EDA and Feature Engineering.py DELETED Viewed

@@ -1,40 +0,0 @@
-import streamlit as st
-import pandas as pd
-# EDA and Feature Engineering Page
-st.title("EDA and Feature Engineering")
-st.markdown("""
-This section is dedicated to exploratory data analysis (EDA) and feature engineering.
-You can upload your dataset and analyze it here.
-""")
-# File uploader for dataset
-uploaded_file = st.file_uploader("Upload your dataset (CSV format):", type=["csv"])
-if uploaded_file is not None:
-    # Read and display the dataset
-    data = pd.read_csv(uploaded_file)
-    st.write("### Uploaded Dataset:")
-    st.dataframe(data)
-    # Overview of the dataset
-    st.write("### Dataset Overview:")
-    st.write(data.describe())
-    # Missing values in the dataset
-    st.write("### Missing Values:")
-    st.write(data.isnull().sum())
-    # Correlation matrix, restricted to numeric columns: corr() on a frame
-    # that still contains text columns raises on pandas 2.x.
-    st.write("### Correlation Matrix:")
-    st.write(data.select_dtypes(include="number").corr())
-    st.markdown("""
-    Based on the insights from this analysis, you can proceed to perform feature engineering by:
-    - Handling missing values.
-    - Creating or transforming features.
-    - Encoding categorical variables.
-    - Normalizing or scaling numerical features.
-    """)
-else:
-    st.warning("Please upload a dataset to proceed with EDA.")