Spaces:

MahatirTusher
/

DataGenie

Sleeping

App Files Files Community

MahatirTusher committed on Apr 15, 2025

Commit

23f3efd

verified ·

1 Parent(s): a133fd6

Update app.py

Browse files

Files changed (1) hide show

app.py +398 -16

app.py CHANGED Viewed

@@ -128,7 +128,7 @@ def add_footer():
         """
         <footer>
             Developed by <a href="https://github.com/Mahatir-Ahmed-Tusher" target="_blank">Mahatir Ahmed Tusher</a>.
-            Inspired by the project "Predicta" by Ahmed Nafiz.
         </footer>
         """,
         unsafe_allow_html=True
@@ -146,7 +146,7 @@ def add_sidebar():
         "DataGenie: AI-powered data science assistant. Generate datasets, analyze data, build ML models. Features: dataset generation, visualization, outlier detection, feature processing, ML model selection, and chat-based exploration."
     )
     st.sidebar.write("**Developed by:** Mahatir Ahmed Tusher")
-    st.sidebar.write("**Inspired by:** Predicta by Ahmed Nafiz")
     st.sidebar.markdown("---")
     st.sidebar.write("**Your**")
     st.sidebar.image(
@@ -367,45 +367,427 @@ def download_image(fig, key_prefix):
 # Data processing functions
 def dataset_overview(df):
     st.subheader("Dataset Overview")
-    st.write(f"Rows: {len(df)}, Columns: {len(df.columns)}")
-    st.write("Data Types:", df.dtypes)
-    st.write(df.head())
 def clean_data(df):
     st.subheader("Clean Data")
     cleaned_df = df.dropna().drop_duplicates()
-    st.write("Cleaned Dataset:", cleaned_df.head())
     return cleaned_df
 def detect_outlier(df):
     st.subheader("Detect Outliers")
     numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
     for col in numerical_cols:
         Q1, Q3 = df[col].quantile([0.25, 0.75])
         IQR = Q3 - Q1
-        outliers = df[(df[col] < (Q1 - 1.5 * IQR)) | (df[col] > (Q3 + 1.5 * IQR))]
-        if not outliers.empty:
-            st.write(f"Outliers in {col}:", outliers)
 def encoder(df):
     st.subheader("Encode Data")
-    le = LabelEncoder()
-    encoded_df = df.copy()
-    for col in df.select_dtypes(include=['object', 'category']).columns:
-        encoded_df[col] = le.fit_transform(df[col])
-    st.write("Encoded Dataset:", encoded_df.head())
-    return encoded_df
 def data_transformer(df):
     st.subheader("Data Transformer")
     transformed_df = df.copy()  # Placeholder for future transformations
     st.write("Transformed Dataset:", transformed_df.head())
     return transformed_df
 def data_analysis(df):
     st.subheader("Data Analysis")
-    st.write(df.describe())
 def feature_importance_analyzer(df):
     st.subheader("Feature Importance Analyzer")
     target_column = st.selectbox("Select Target Column", df.columns)

         """
         <footer>
             Developed by <a href="https://github.com/Mahatir-Ahmed-Tusher" target="_blank">Mahatir Ahmed Tusher</a>.
+            Inspired by the project "Predicta" by <a href="https://github.com/ahammadnafiz" target="_blank"> Ahammad Nafiz </a>.
         </footer>
         """,
         unsafe_allow_html=True
         "DataGenie: AI-powered data science assistant. Generate datasets, analyze data, build ML models. Features: dataset generation, visualization, outlier detection, feature processing, ML model selection, and chat-based exploration."
     )
     st.sidebar.write("**Developed by:** Mahatir Ahmed Tusher")
+    st.sidebar.write("**Inspired by:** Predicta by Ahammad Nafiz")
     st.sidebar.markdown("---")
     st.sidebar.write("**Your**")
     st.sidebar.image(
 # Data processing functions
 def dataset_overview(df):
     """Render a multi-section overview of ``df`` on the Streamlit page.

     Sections, in order: size and memory usage, a per-column table of dtype
     and missing values, a describe()-based summary of the int64/float64
     columns (plus skewness and kurtosis), the five most frequent values of
     every object/category column, a duplicate-row check, and the first five
     rows.

     Args:
         df (pd.DataFrame): Dataset to summarise. It is only read.
     """
     n_rows = len(df)
     n_cols = len(df.columns)
     st.subheader("Dataset Overview")

     # Size and memory footprint.
     st.markdown("#### Basic Information")
     st.write(f"**Rows**: {n_rows:,} | **Columns**: {n_cols:,}")
     memory_mb = df.memory_usage(deep=True).sum() / 1024**2
     st.write(f"**Memory Usage**: {memory_mb:.2f} MB")

     # One row per column: dtype, non-null count, missing count and share.
     st.markdown("#### Data Types and Missing Values")
     null_counts = df.isna().sum()
     column_info = pd.DataFrame({
         "Column": df.columns,
         "Data Type": df.dtypes,
         "Non-Null Count": df.count(),
         "Missing Values": null_counts,
         "Missing %": (null_counts / n_rows * 100).round(2),
     })
     column_info = column_info.reset_index(drop=True)
     st.dataframe(column_info.style.highlight_null(color='lightcoral'))

     # describe() transposed so each numerical column becomes a row.
     st.markdown("#### Numerical Columns Summary")
     numeric_names = df.select_dtypes(include=['int64', 'float64']).columns
     if not numeric_names.empty:
         numeric_part = df[numeric_names]
         stats_table = numeric_part.describe().T.round(2)
         stats_table['Skewness'] = numeric_part.skew().round(2)
         stats_table['Kurtosis'] = numeric_part.kurt().round(2)
         st.dataframe(stats_table)

     # Top five values per categorical column (the loop is a no-op if none).
     st.markdown("#### Categorical Columns Summary")
     for name in df.select_dtypes(include=['object', 'category']).columns:
         top_values = df[name].value_counts().head(5)
         st.write(f"**{name}** (Top 5 values):")
         st.dataframe(pd.DataFrame({
             "Value": top_values.index,
             "Count": top_values.values,
             "% of Total": (top_values.values / n_rows * 100).round(2),
         }))

     st.markdown("#### Duplicate Rows")
     n_duplicates = df.duplicated().sum()
     if n_duplicates == 0:
         st.success("No duplicate rows detected.")
     else:
         st.warning(f"Found {n_duplicates} duplicate rows ({n_duplicates / n_rows * 100:.2f}% of total).")

     st.markdown("#### Sample Data (First 5 Rows)")
     st.dataframe(df.head())
 def clean_data(df):
     """Report missing values and duplicates, then return a cleaned copy.

     Cleaning drops every row that contains a missing value and then drops
     duplicate rows. The duplicate count shown to the user is computed on the
     raw ``df``, before rows with missing values are removed.

     Args:
         df (pd.DataFrame): Dataset to clean. It is not modified.

     Returns:
         pd.DataFrame: Result of ``df.dropna().drop_duplicates()``.
     """
     st.subheader("Clean Data")

     st.markdown("#### Missing Values")
     null_counts = df.isnull().sum()
     null_report = pd.DataFrame({
         "Missing Values": null_counts,
         "Missing Percentage (%)": (null_counts / len(df) * 100).round(2),
     })
     # Worst-affected columns first.
     st.dataframe(null_report.sort_values(by="Missing Values", ascending=False))

     st.markdown("#### Duplicate Rows")
     n_duplicates = df.duplicated().sum()
     if n_duplicates == 0:
         st.success("No duplicate rows detected.")
     else:
         st.warning(f"Found {n_duplicates} duplicate rows. They will be removed.")

     without_nulls = df.dropna()
     cleaned_df = without_nulls.drop_duplicates()
     st.write(f"Cleaned Dataset: {len(cleaned_df)} rows remaining after cleaning.")
     st.dataframe(cleaned_df.head())
     return cleaned_df
 def _iqr_bounds(series, whisker=1.5):
     """Return the (lower, upper) fences ``Q1 - k*IQR`` and ``Q3 + k*IQR`` of ``series``."""
     q1, q3 = series.quantile([0.25, 0.75])
     spread = q3 - q1
     return q1 - whisker * spread, q3 + whisker * spread


 def detect_outlier(df):
     """Summarise and visualise IQR-based outliers of the numerical columns.

     A value counts as an outlier when it lies more than 1.5 * IQR below the
     first quartile or above the third quartile of its column. The function
     shows a per-column summary table (sorted by outlier count) and a box
     plot of a user-selected column with both fences marked.

     Args:
         df (pd.DataFrame): Dataset to inspect. Only int64/float64 columns
             are considered. It is not modified.

     Returns:
         None. All output goes to the Streamlit page.
     """
     st.subheader("Detect Outliers")
     numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
     # Test for emptiness explicitly: Index.any() looks at the truthiness of
     # the labels themselves, so a lone column named 0 or "" would be missed.
     if numerical_cols.empty:
         st.warning("No numerical columns available for outlier detection.")
         return

     st.markdown("#### Outlier Detection Summary")
     total_rows = len(df)
     outlier_summary = []
     for col in numerical_cols:
         lower_bound, upper_bound = _iqr_bounds(df[col])
         is_outlier = (df[col] < lower_bound) | (df[col] > upper_bound)
         outlier_count = int(is_outlier.sum())
         # A frame with columns but no rows must not divide by zero.
         outlier_percentage = round(outlier_count / total_rows * 100, 2) if total_rows else 0.0
         outlier_summary.append({
             "Column": col,
             "Outliers": outlier_count,
             "Outlier Percentage (%)": outlier_percentage
         })
     outlier_df = pd.DataFrame(outlier_summary).sort_values(by="Outliers", ascending=False)
     st.dataframe(outlier_df)

     st.markdown("#### Outlier Visualization")
     selected_col = st.selectbox("Select a column to visualize outliers", numerical_cols)
     # Compare against None rather than relying on truthiness, so a column
     # labelled 0 can still be plotted.
     if selected_col is None:
         return
     lower_bound, upper_bound = _iqr_bounds(df[selected_col])
     fig, ax = plt.subplots()
     # x=... draws a horizontal box, so the data values run along the x-axis
     # and the fences have to be vertical lines.
     sns.boxplot(data=df, x=selected_col, ax=ax)
     ax.axvline(lower_bound, color='red', linestyle='--', label='Lower Bound')
     ax.axvline(upper_bound, color='blue', linestyle='--', label='Upper Bound')
     ax.legend()
     st.pyplot(fig)
     plt.close(fig)
+# Data Encoder
 def _fill_missing(series, fill_value):
     """Fill NaNs in ``series``, first registering ``fill_value`` as a category if required."""
     if isinstance(series.dtype, pd.CategoricalDtype) and fill_value not in series.cat.categories:
         # fillna() on a categorical rejects values outside its categories.
         series = series.cat.add_categories([fill_value])
     return series.fillna(fill_value)


 def encoder(df):
     """Interactively encode the categorical columns of ``df``.

     The user chooses which object/category columns to encode, how missing
     values are handled, and one method per column (Label, One-Hot or
     Frequency Encoding). Nothing is encoded until "Apply Encoding" is
     pressed; the result is then stored in ``st.session_state.encoded_df`` so
     that it survives Streamlit reruns.

     Each application starts from a fresh copy of ``df``. Pressing the button
     repeatedly therefore never re-encodes already encoded values and never
     looks up a column that a previous One-Hot pass removed.

     Args:
         df (pd.DataFrame): Input dataset to encode. It is not modified.

     Returns:
         pd.DataFrame: ``df`` itself when it has no categorical columns or
         when encoding raises; otherwise the frame held in
         ``st.session_state.encoded_df`` (the last applied encoding, or a copy
         of the first ``df`` seen in this session if nothing was applied yet).
     """
     st.subheader("Encode Data")
     # NOTE(review): this cache is created once per session and is not reset
     # when a different dataset is passed in later - confirm whether the
     # caller clears it on upload.
     if 'encoded_df' not in st.session_state:
         st.session_state.encoded_df = df.copy()

     categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
     if not categorical_cols:
         st.warning("No categorical columns ('object' or 'category') found in the dataset.")
         return df

     total_rows = len(df)
     # nunique() is needed several times per column; compute it once.
     unique_counts = {col: df[col].nunique() for col in categorical_cols}

     st.markdown("### Categorical Columns Detected")
     # Column labels are not guaranteed to be strings, so convert before joining.
     st.write(f"Found {len(categorical_cols)} categorical columns: {', '.join(map(str, categorical_cols))}")
     for col in categorical_cols:
         missing = df[col].isna().sum()
         st.write(f"- **{col}**: {unique_counts[col]} unique values, "
                  f"{missing} missing ({missing / total_rows * 100:.2f}%)")

     st.markdown("### Encoding Configuration")
     encoding_methods = {
         "Label Encoding": "Assigns integers to categories (best for ordinal data).",
         "One-Hot Encoding": "Creates binary columns for each category (best for non-ordinal data, avoid high cardinality).",
         "Frequency Encoding": "Replaces categories with their frequency (useful for high-cardinality columns)."
     }
     cols_to_encode = st.multiselect("Select Columns to Encode", categorical_cols, default=categorical_cols,
                                    help="Choose which categorical columns to encode. Unselected columns remain unchanged.")
     if not cols_to_encode:
         st.warning("Please select at least one column to encode.")
         return st.session_state.encoded_df

     missing_strategy = st.selectbox("Handle Missing Values",
                                     ["Keep as NaN", "Impute with Mode", "Impute with Custom Value"],
                                     help="Choose how to handle missing values before encoding.")
     custom_value = None
     if missing_strategy == "Impute with Custom Value":
         custom_value = st.text_input("Enter Custom Value for Missing Entries", value="Unknown")

     # Work on a fresh copy of the input, not on the cached previous result.
     encoded_df = df.copy()
     for col in cols_to_encode:
         if missing_strategy == "Impute with Mode":
             modes = df[col].mode()
             mode_val = modes.iloc[0] if not modes.empty else "Unknown"
             encoded_df[col] = _fill_missing(df[col], mode_val)
         elif missing_strategy == "Impute with Custom Value":
             encoded_df[col] = _fill_missing(df[col], custom_value)

     st.markdown("### Assign Encoding Methods")
     method_names = list(encoding_methods)
     encoding_assignments = {}
     for col in cols_to_encode:
         # Low-cardinality columns default to One-Hot, the rest to Frequency.
         default_method = "One-Hot Encoding" if unique_counts[col] <= 10 else "Frequency Encoding"
         encoding_assignments[col] = st.selectbox(
             f"Encoding Method for {col}",
             method_names,
             index=method_names.index(default_method),
             help=f"{encoding_methods[default_method]} Unique values: {unique_counts[col]}"
         )

     if not st.button("Apply Encoding"):
         # Nothing requested on this run: show whatever is cached.
         st.markdown("### Current Dataset Preview")
         st.dataframe(st.session_state.encoded_df.head())
         return st.session_state.encoded_df

     # UI boundary: any failure is reported on the page and the untouched
     # input is returned instead of crashing the app.
     try:
         for col, method in encoding_assignments.items():
             if method == "Label Encoding":
                 le = LabelEncoder()
                 # Convert to string to handle mixed types; a remaining NaN
                 # becomes the class "nan".
                 encoded_df[col] = le.fit_transform(encoded_df[col].astype(str))
                 st.session_state[f"label_encoder_{col}"] = le  # Store encoder for reference
                 st.write(f"**{col}**: Label Encoded. Classes: {list(le.classes_)}")
             elif method == "One-Hot Encoding":
                 if unique_counts[col] > 50:
                     st.warning(f"**{col}** has {unique_counts[col]} unique values. One-Hot Encoding may create many columns.")
                 # Dummies are built from the non-missing rows only; the
                 # index-aligned concat leaves NaN in the dummy columns of
                 # the rows whose value was missing.
                 mask = encoded_df[col].notna()
                 ohe_df = pd.get_dummies(encoded_df.loc[mask, col], prefix=col, drop_first=True)
                 encoded_df = pd.concat([encoded_df.drop(columns=[col]), ohe_df], axis=1)
                 st.write(f"**{col}**: One-Hot Encoded. Created {len(ohe_df.columns)} new columns.")
             elif method == "Frequency Encoding":
                 # Use the (possibly imputed) working column so the chosen
                 # missing-value strategy is honoured. Going through object
                 # dtype makes the result numeric even for categoricals.
                 values = encoded_df[col].astype(object)
                 freq_map = values.value_counts(normalize=True).to_dict()
                 encoded_df[col] = values.map(freq_map)
                 st.write(f"**{col}**: Frequency Encoded. Values mapped to proportions.")

         st.session_state.encoded_df = encoded_df

         st.markdown("### Encoded Dataset Preview")
         st.dataframe(encoded_df.head())

         new_cols = len(encoded_df.columns) - len(df.columns)
         if new_cols > 0:
             st.info(f"Encoding added {new_cols} new columns.")
         remaining_missing = encoded_df.isna().sum().sum()
         if remaining_missing > 0:
             st.warning(f"Encoded dataset still has {remaining_missing} missing values.")

         csv_bytes = encoded_df.to_csv(index=False).encode()
         st.download_button(
             label="Download Encoded Dataset",
             data=csv_bytes,
             file_name="encoded_dataset.csv",
             mime="text/csv",
             key="download_encoded"
         )
     except Exception as e:
         st.error(f"Error during encoding: {str(e)}")
         return df
     return st.session_state.encoded_df
+# Data Transformer
 def data_transformer(df):
     """Show and return an untransformed copy of ``df``.

     No transformation is implemented yet; the function copies its input,
     previews the first rows of the copy and hands the copy back.

     Args:
         df (pd.DataFrame): Dataset to transform. It is not modified.

     Returns:
         pd.DataFrame: A copy of ``df``.
     """
     st.subheader("Data Transformer")
     # Placeholder: future transformations will be applied to this copy.
     result = df.copy()
     preview = result.head()
     st.write("Transformed Dataset:", preview)
     return result
+# Data Analysis
 def _percent(part, total):
     """Return ``part`` as a percentage of ``total`` rounded to 2 places (0.0 when ``total`` is 0)."""
     return round(float(part) / total * 100, 2) if total else 0.0


 def _iqr_outlier_count(series):
     """Count the values of ``series`` lying outside the 1.5 * IQR fences."""
     q1, q3 = series.quantile([0.25, 0.75])
     iqr = q3 - q1
     return int(((series < (q1 - 1.5 * iqr)) | (series > (q3 + 1.5 * iqr))).sum())


 def _render_summary_tab(df, numerical_cols, categorical_cols):
     """Show summary tables for the numerical and the categorical columns."""
     st.markdown("### Summary Statistics")
     if numerical_cols.empty:
         st.info("No numerical columns found.")
     else:
         st.markdown("#### Numerical Columns")
         numeric_part = df[numerical_cols]
         numerical_summary = numeric_part.describe().T.round(2)
         numerical_summary['Skewness'] = numeric_part.skew().round(2)
         numerical_summary['Kurtosis'] = numeric_part.kurt().round(2)
         numerical_summary['Missing %'] = (numeric_part.isna().sum() / len(df) * 100).round(2)
         st.dataframe(numerical_summary.style.highlight_max(axis=0, color='lightgreen'))
         # pandas reports excess (Fisher) kurtosis, for which a normal
         # distribution scores 0, so the reference point is 0 and not 3.
         st.write("*Skewness > 1 or < -1 indicates high skew. "
                  "Excess kurtosis > 0 indicates heavier tails than a normal distribution.*")

     if categorical_cols.empty:
         st.info("No categorical columns found.")
     else:
         st.markdown("#### Categorical Columns")
         modes = [df[col].mode() for col in categorical_cols]
         cat_summary = pd.DataFrame({
             "Column": categorical_cols,
             "Unique Values": [df[col].nunique() for col in categorical_cols],
             # mode() is empty for an all-NaN column.
             "Most Frequent": [m.iloc[0] if not m.empty else np.nan for m in modes],
             "Missing %": [_percent(df[col].isna().sum(), len(df)) for col in categorical_cols]
         })
         st.dataframe(cat_summary)


 def _render_visual_tab(df, numerical_cols, categorical_cols):
     """Draw the plot type chosen by the user, or say why it is unavailable."""
     st.markdown("### Visual Exploration")
     viz_type = st.selectbox("Select Visualization Type",
                             ["Distribution (Numerical)", "Count Plot (Categorical)", "Correlation Heatmap", "Pair Plot"],
                             key="data_analysis_viz")
     if viz_type == "Distribution (Numerical)":
         if numerical_cols.empty:
             st.info("No numerical columns available for a distribution plot.")
             return
         col = st.selectbox("Select Column", numerical_cols, key="num_dist_col")
         fig, ax = plt.subplots()
         sns.histplot(data=df, x=col, kde=True, bins='auto', ax=ax)
         ax.set_title(f"Distribution of {col}")
         st.pyplot(fig)
         download_image(fig, f"dist_{col}")
         plt.close(fig)
     elif viz_type == "Count Plot (Categorical)":
         if categorical_cols.empty:
             st.info("No categorical columns available for a count plot.")
             return
         col = st.selectbox("Select Column", categorical_cols, key="cat_count_col")
         fig, ax = plt.subplots()
         sns.countplot(data=df, x=col, ax=ax)
         plt.xticks(rotation=45, ha='right')
         ax.set_title(f"Count Plot of {col}")
         st.pyplot(fig)
         download_image(fig, f"count_{col}")
         plt.close(fig)
     elif viz_type == "Correlation Heatmap":
         if numerical_cols.size < 2:
             st.info("At least two numerical columns are required for a correlation heatmap.")
             return
         fig, ax = plt.subplots(figsize=(10, 8))
         sns.heatmap(df[numerical_cols].corr(), annot=True, cmap="coolwarm", vmin=-1, vmax=1, fmt='.2f', ax=ax)
         ax.set_title("Correlation Heatmap")
         st.pyplot(fig)
         download_image(fig, "corr_heatmap")
         plt.close(fig)
     elif viz_type == "Pair Plot":
         if numerical_cols.size < 2:
             st.info("At least two numerical columns are required for a pair plot.")
             return
         selected_cols = st.multiselect("Select Columns (max 4)", numerical_cols, max_selections=4, key="pair_cols")
         if len(selected_cols) < 2:
             st.info("Select at least two columns to draw the pair plot.")
             return
         # pairplot returns a seaborn grid object, not a Matplotlib figure;
         # it exposes savefig(), which is all that is used below.
         grid = sns.pairplot(df[selected_cols].dropna())
         st.pyplot(grid)
         img_bytes = BytesIO()
         grid.savefig(img_bytes, format='png', bbox_inches='tight')
         img_bytes.seek(0)
         # A fixed key keeps the widget identity stable across reruns.
         st.download_button(label="Download Pair Plot", data=img_bytes,
                            file_name=f"pairplot_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png",
                            mime="image/png", key="download_pairplot")
         # Closes the current figure, i.e. the one pairplot just created.
         plt.close()


 def _render_quality_tab(df, numerical_cols):
     """Report missing values, duplicate rows and IQR outliers."""
     st.markdown("### Data Quality Checks")
     total_rows = len(df)

     missing_counts = df.isna().sum()
     missing_total = missing_counts.sum()
     if missing_total > 0:
         st.warning(f"**Missing Values**: {missing_total} across {(missing_counts > 0).sum()} columns.")
         missing_df = pd.DataFrame({
             "Column": df.columns,
             "Missing Count": missing_counts,
             "Missing %": (missing_counts / total_rows * 100).round(2)
         })
         st.dataframe(missing_df[missing_df["Missing Count"] > 0])
     else:
         st.success("No missing values detected.")

     duplicates = df.duplicated().sum()
     if duplicates > 0:
         st.warning(f"**Duplicates**: {duplicates} duplicate rows ({duplicates / total_rows * 100:.2f}%).")
     else:
         st.success("No duplicate rows detected.")

     if numerical_cols.empty:
         return
     outlier_summary = []
     for col in numerical_cols:
         outliers = _iqr_outlier_count(df[col])
         if outliers > 0:
             outlier_summary.append({"Column": col, "Outlier Count": outliers,
                                     "Outlier %": _percent(outliers, total_rows)})
     if outlier_summary:
         st.warning("**Outliers Detected**:")
         st.dataframe(pd.DataFrame(outlier_summary))
     else:
         st.success("No outliers detected in numerical columns.")


 def _render_deep_dive_tab(df):
     """Show details and a chart for one user-selected column."""
     st.markdown("### Column Deep Dive")
     selected_col = st.selectbox("Select Column for Detailed Analysis", df.columns, key="deep_dive_col")
     if selected_col is None:
         # The select box yields None when the frame has no columns.
         st.info("The dataset has no columns to inspect.")
         return
     column = df[selected_col]
     total_rows = len(df)
     missing = column.isna().sum()
     unique = column.nunique()
     st.write(f"**Column**: {selected_col}")
     st.write(f"**Data Type**: {column.dtype}")
     st.write(f"**Missing Values**: {missing} ({_percent(missing, total_rows):.2f}%)")
     st.write(f"**Unique Values**: {unique} ({_percent(unique, total_rows):.2f}%)")
     if pd.api.types.is_numeric_dtype(column):
         st.write("**Summary Statistics**:")
         stats = column.describe().round(2)
         # skew()/kurt() return a plain float NaN for very short columns,
         # which has no .round() method; the builtin round() handles both.
         stats['Skewness'] = round(column.skew(), 2)
         stats['Kurtosis'] = round(column.kurt(), 2)
         st.dataframe(stats)
         fig = px.histogram(df, x=selected_col, nbins=30, title=f"Distribution of {selected_col}")
         st.plotly_chart(fig)
     elif pd.api.types.is_object_dtype(column) or isinstance(column.dtype, pd.CategoricalDtype):
         st.write("**Top 5 Values**:")
         value_counts = column.value_counts().head(5)
         st.dataframe(pd.DataFrame({
             "Value": value_counts.index,
             "Count": value_counts.values,
             "% of Total": (value_counts.values / total_rows * 100).round(2)
         }))
         fig = px.bar(x=value_counts.index, y=value_counts.values, title=f"Top Values in {selected_col}")
         st.plotly_chart(fig)


 def data_analysis(df):
     """Render a four-tab analysis of ``df`` on the Streamlit page.

     Tabs: summary statistics, interactive plots, data-quality checks
     (missing values, duplicates, IQR outliers) and a per-column deep dive.
     Columns of dtype int64/float64 are treated as numerical and columns of
     dtype object/category as categorical.

     Args:
         df (pd.DataFrame): Input dataset to analyze. It is not modified.

     Returns:
         None. All output goes to the Streamlit page.
     """
     st.subheader("Data Analysis")
     # Computed once up front because several tabs need them.
     numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
     categorical_cols = df.select_dtypes(include=['object', 'category']).columns
     tab1, tab2, tab3, tab4 = st.tabs(["Summary Statistics", "Visual Exploration", "Data Quality", "Column Deep Dive"])
     with tab1:
         _render_summary_tab(df, numerical_cols, categorical_cols)
     with tab2:
         _render_visual_tab(df, numerical_cols, categorical_cols)
     with tab3:
         _render_quality_tab(df, numerical_cols)
     with tab4:
         _render_deep_dive_tab(df)
+def download_image(fig, key_prefix):
+    """
+    Utility function to download a Matplotlib figure as PNG.
+    """
+    img_bytes = BytesIO()
+    fig.savefig(img_bytes, format='png', bbox_inches='tight')
+    img_bytes.seek(0)
+    st.download_button(label="Download Image", data=img_bytes,
+                      file_name=f"{key_prefix}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png",
+                      mime="image/png", key=f"download_{key_prefix}_{datetime.now().strftime('%H%M%S')}")
+# Feature Importance Analysis
 def feature_importance_analyzer(df):
     st.subheader("Feature Importance Analyzer")
     target_column = st.selectbox("Select Target Column", df.columns)