import streamlit as st import pandas as pd from faker import Faker import random from groq import Groq from io import BytesIO import ast import re import matplotlib.pyplot as plt import seaborn as sns import plotly.express as px import io from datetime import datetime from sklearn.preprocessing import LabelEncoder import matplotlib from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor from sklearn.model_selection import train_test_split, GridSearchCV from sklearn.linear_model import LogisticRegression, LinearRegression from sklearn.svm import SVC, SVR from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering from sklearn.preprocessing import StandardScaler from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor from sklearn.naive_bayes import GaussianNB from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor import numpy as np from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, mean_absolute_error, mean_squared_error, r2_score, silhouette_score, davies_bouldin_score, calinski_harabasz_score) # For NLP Pipeline from sklearn.feature_extraction.text import TfidfVectorizer from gensim import corpora from gensim.models import LdaModel import spacy import nltk from nltk.tokenize import word_tokenize from nltk.corpus import stopwords from nltk.stem import WordNetLemmatizer # Download NLTK resources import nltk import os nltk.data.path.append(os.path.join(os.path.dirname(__file__), 'nltk_data')) try: nltk.data.find('tokenizers/punkt_tab') nltk.data.find('corpora/stopwords') nltk.data.find('corpora/wordnet') except LookupError: nltk.download('punkt_tab') nltk.download('stopwords') nltk.download('wordnet') # Ensure spaCy model try: spacy.load("en_core_web_sm") except OSError: import os os.system("python -m spacy download en_core_web_sm") # Conditional import for time series models try: from statsmodels.tsa.holtwinters import ExponentialSmoothing from statsmodels.tsa.arima.model import ARIMA HAS_STATSMODELS = True except ImportError: HAS_STATSMODELS = False # Set matplotlib backend for Streamlit compatibility matplotlib.use('Agg') # Initialize Faker and apply custom styles fake = Faker() def add_custom_styles(): st.markdown( """ """, unsafe_allow_html=True ) def add_header(): st.markdown( """ """, unsafe_allow_html=True ) st.markdown("### Upload Your Dataset for Preprocessing, Training, and EDA") uploaded_file = st.file_uploader("Upload CSV", type="csv") if uploaded_file: try: df = pd.read_csv(uploaded_file) st.success("Dataset uploaded successfully!") st.session_state['uploaded_df'] = df st.write("Preview of the uploaded dataset:") st.dataframe(df.head()) except Exception as e: st.error(f"Error loading CSV file: {str(e)}") else: st.info("Upload a CSV file to get started. And Go to the Sidebar to start working on your dataset") def add_footer(): st.markdown( """ """, unsafe_allow_html=True ) def add_sidebar(): st.sidebar.image( "https://i.postimg.cc/5y20B10S/89c59ca6-c8a8-4210-ba7b-f77a44a8fa3a-removalai-preview.png", width=150, caption="DataGenie" ) st.sidebar.markdown("---") st.sidebar.title("About DataGenie") st.sidebar.info( "DataGenie: AI-powered data science assistant. Generate datasets, analyze data, build ML models. Features: dataset generation, visualization, outlier detection, feature processing, ML model selection, and chat-based exploration." ) st.sidebar.write("**Developed by:** Mahatir Ahmed Tusher") st.sidebar.write("**Inspired by:** Predicta by Ahammad Nafiz") st.sidebar.markdown("---") st.sidebar.write("**Your**") st.sidebar.image( "https://i.postimg.cc/5y20B10S/89c59ca6-c8a8-4210-ba7b-f77a44a8fa3a-removalai-preview.png", width=150 ) # App configuration APP_NAME = "DataGenie" # Initialize Groq client with API key GROQ_API_KEY = "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" try: client = Groq(api_key=GROQ_API_KEY) except Exception as e: st.error(f"Invalid Groq API key: {str(e)}. Please update GROQ_API_KEY.") st.stop() # Utility functions def extract_row_count(prompt): match = re.search(r'(\d+)\s*(rows|records|entries)', prompt, re.IGNORECASE) return int(match.group(1)) if match else 100 def generate_dataset_code(prompt): try: chat_completion = client.chat.completions.create( messages=[ { "role": "system", "content": ( "You are an expert Python code generator specializing in creating synthetic datasets using pandas, faker, and random. " "Based on the user's natural language prompt, generate a valid Python function named `create_dataset()` that returns a pandas DataFrame. " "Follow these strict rules:\n" "1. The function must start exactly with `def create_dataset():` and take no arguments.\n" "2. Use only `pd` (pandas), `fake` (Faker), and `random` (random module) within the function.\n" "3. Extract the number of rows from the prompt (e.g., '500 rows' or '1000 records') and use `range()` to generate exactly that many rows. If no row count is specified, default to 100 rows.\n" "4. Generate realistic data for all columns specified in the prompt, respecting any domain-specific details (e.g., age between 18-80, prices in USD, regional names).\n" "5. For target columns (e.g., 'yes/no', 'percentage', 'price', 'category'), use appropriate distributions or logic (e.g., random.choice(['Yes', 'No']), random.uniform(0, 100) for percentages).\n" "6. Ensure data types are correct: integers for counts, floats for percentages/prices, strings for names/emails, etc.\n" "7. The function must end with `return pd.DataFrame(data)` where `data` is a dictionary of column lists.\n" "8. Do not include comments, markdown, explanations, or extra text outside the function definition.\n" "Example for prompt 'Generate 200 rows of customer data with name, age, email, and purchase_amount':\n" "def create_dataset():\n" " data = {\n" " 'name': [fake.name() for _ in range(200)],\n" " 'age': [random.randint(18, 80) for _ in range(200)],\n" " 'email': [fake.email() for _ in range(200)],\n" " 'purchase_amount': [round(random.uniform(10.0, 500.0), 2) for _ in range(200)]\n" " }\n" " return pd.DataFrame(data)\n" "Handle edge cases gracefully, such as missing column details, by using reasonable defaults. " "Ensure the code is syntactically correct and executable. Remember, in case of classification yes means 1 and no means 0." ), }, {"role": "user", "content": prompt}, ], model="llama-3.3-70b-versatile", ) code = chat_completion.choices[0].message.content.strip() if not code.startswith("def create_dataset():"): st.error("Generated code does not define create_dataset function correctly.") st.code(code, language="python") return None try: ast.parse(code) return code except SyntaxError as e: st.error(f"Invalid syntax in generated code: {str(e)}") st.code(code, language="python") return None except Exception as e: st.error(f"Error with Groq API: {str(e)}") return None def execute_code(code): safe_globals = { "pd": pd, "fake": fake, "random": random, "__builtins__": { "range": range, "list": list, "int": int, "str": str, "float": float, "round": round, "True": True, "False": False, "zip": zip, }, } safe_locals = {} try: exec(code, safe_globals, safe_locals) create_dataset = safe_locals.get("create_dataset") if not create_dataset: st.error("No create_dataset function defined.") return None df = create_dataset() if not isinstance(df, pd.DataFrame): st.error("Generated code did not return a pandas DataFrame.") return None return df except Exception as e: st.error(f"Execution error: {str(e)}") return None def to_csv_bytes(df): output = BytesIO() df.to_csv(output, index=False) output.seek(0) return output # Visualization functions def visualize_dataset(df): st.subheader("Dataset Visualizations") if df.empty or not isinstance(df, pd.DataFrame): st.warning("No valid data to visualize.") return numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist() categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist() datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist() all_cols = numerical_cols + categorical_cols + datetime_cols if not all_cols: st.warning("No columns available to visualize.") return viz_type = st.sidebar.selectbox("Select Visualization Type", ["Histogram", "Box Plot", "Scatter Plot", "Count Plot", "Correlation Heatmap"] + (["Time Series"] if datetime_cols and numerical_cols else [])) plt.clf() try: if viz_type == "Histogram" and numerical_cols: col = st.sidebar.selectbox("Select Numerical Column", numerical_cols) fig, ax = plt.subplots() sns.histplot(data=df, x=col, kde=True, bins='auto', ax=ax) st.pyplot(fig) download_image(fig, f"histogram_{col}") plt.close(fig) elif viz_type == "Box Plot" and numerical_cols: col = st.sidebar.selectbox("Select Numerical Column", numerical_cols) fig, ax = plt.subplots() sns.boxplot(data=df, y=col, ax=ax) st.pyplot(fig) download_image(fig, f"boxplot_{col}") plt.close(fig) elif viz_type == "Scatter Plot" and len(numerical_cols) >= 2: x_col = st.sidebar.selectbox("Select X-axis Column", numerical_cols) y_col = st.sidebar.selectbox("Select Y-axis Column", [c for c in numerical_cols if c != x_col]) fig = px.scatter(df, x=x_col, y=y_col) st.plotly_chart(fig) img_bytes = io.BytesIO() fig.write_image(img_bytes, format='png') st.sidebar.download_button("Download Scatter Plot", img_bytes.getvalue(), file_name=f"scatter_{x_col}_{y_col}.png", key=f"scatter_{x_col}_{y_col}_{datetime.now().strftime('%H%M%S')}") elif viz_type == "Count Plot" and categorical_cols: col = st.sidebar.selectbox("Select Categorical Column", categorical_cols) fig, ax = plt.subplots() sns.countplot(data=df, x=col, ax=ax) plt.xticks(rotation=45, ha='right') st.pyplot(fig) download_image(fig, f"countplot_{col}") plt.close(fig) elif viz_type == "Correlation Heatmap" and numerical_cols: fig, ax = plt.subplots(figsize=(10, 8)) sns.heatmap(df[numerical_cols].corr(), annot=True, cmap="coolwarm", vmin=-1, vmax=1, fmt='.2f', ax=ax) st.pyplot(fig) download_image(fig, "correlation_heatmap") plt.close(fig) elif viz_type == "Time Series" and datetime_cols and numerical_cols: datetime_col = st.sidebar.selectbox("Select Datetime Column", datetime_cols) value_col = st.sidebar.selectbox("Select Value Column", numerical_cols) df[datetime_col] = pd.to_datetime(df[datetime_col], errors='coerce') fig = px.line(df, x=datetime_col, y=value_col) st.plotly_chart(fig) img_bytes = io.BytesIO() fig.write_image(img_bytes, format='png') st.sidebar.download_button("Download Time Series", img_bytes.getvalue(), file_name=f"time_series_{datetime_col}_{value_col}.png", key=f"timeseries_{datetime_col}_{value_col}_{datetime.now().strftime('%H%M%S')}") except Exception as e: st.error(f"Visualization error: {str(e)}") def visualize_specific_features(df, features): st.subheader("Feature-Specific Visualizations") for feature in features: if feature not in df.columns: st.warning(f"Feature '{feature}' not found.") continue fig, ax = plt.subplots() try: if pd.api.types.is_numeric_dtype(df[feature]): sns.histplot(data=df, x=feature, kde=True, bins='auto', ax=ax) elif pd.api.types.is_categorical_dtype(df[feature]) or pd.api.types.is_string_dtype(df[feature]): sns.countplot(data=df, x=feature, ax=ax) plt.xticks(rotation=45, ha='right') elif pd.api.types.is_datetime64_any_dtype(df[feature]): st.warning(f"Use 'Time Series' in main visualization for '{feature}'.") plt.close(fig) continue st.pyplot(fig) download_image(fig, f"feature_{feature}") plt.close(fig) except Exception as e: st.error(f"Error visualizing '{feature}': {str(e)}") plt.close(fig) def download_image(fig, key_prefix): img_bytes = io.BytesIO() fig.savefig(img_bytes, format='png', bbox_inches='tight') img_bytes.seek(0) st.sidebar.download_button(label="Download Image", data=img_bytes, file_name=f"{key_prefix}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png", mime="image/png", key=f"download_{key_prefix}_{datetime.now().strftime('%H%M%S')}") # Data processing functions def dataset_overview(df): st.subheader("Dataset Overview") st.markdown("#### Basic Information") st.write(f"**Rows**: {len(df):,} | **Columns**: {len(df.columns):,}") st.write(f"**Memory Usage**: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB") st.markdown("#### Data Types and Missing Values") dtypes_df = pd.DataFrame({ "Column": df.columns, "Data Type": df.dtypes, "Non-Null Count": df.count(), "Missing Values": df.isna().sum(), "Missing %": (df.isna().sum() / len(df) * 100).round(2) }).reset_index(drop=True) st.dataframe(dtypes_df.style.highlight_null(color='lightcoral')) st.markdown("#### Numerical Columns Summary") numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns if numerical_cols.size > 0: numerical_summary = df[numerical_cols].describe().T.round(2) numerical_summary['Skewness'] = df[numerical_cols].skew().round(2) numerical_summary['Kurtosis'] = df[numerical_cols].kurt().round(2) st.dataframe(numerical_summary) st.markdown("#### Categorical Columns Summary") categorical_cols = df.select_dtypes(include=['object', 'category']).columns if categorical_cols.size > 0: for col in categorical_cols: value_counts = df[col].value_counts().head(5) st.write(f"**{col}** (Top 5 values):") st.dataframe(pd.DataFrame({ "Value": value_counts.index, "Count": value_counts.values, "% of Total": (value_counts.values / len(df) * 100).round(2) })) st.markdown("#### Duplicate Rows") duplicate_count = df.duplicated().sum() if duplicate_count > 0: st.warning(f"Found {duplicate_count} duplicate rows ({duplicate_count / len(df) * 100:.2f}% of total).") else: st.success("No duplicate rows detected.") st.markdown("#### Sample Data (First 5 Rows)") st.dataframe(df.head()) def clean_data(df): st.subheader("Clean Data") st.markdown("#### Missing Values") missing_values = df.isnull().sum() missing_percentage = (missing_values / len(df) * 100).round(2) missing_summary = pd.DataFrame({ "Missing Values": missing_values, "Missing Percentage (%)": missing_percentage }).sort_values(by="Missing Values", ascending=False) st.dataframe(missing_summary) st.markdown("#### Duplicate Rows") duplicate_count = df.duplicated().sum() if duplicate_count > 0: st.warning(f"Found {duplicate_count} duplicate rows. They will be removed.") else: st.success("No duplicate rows detected.") cleaned_df = df.dropna().drop_duplicates() st.write(f"Cleaned Dataset: {len(cleaned_df)} rows remaining after cleaning.") st.dataframe(cleaned_df.head()) return cleaned_df def detect_outlier(df): st.subheader("Detect Outliers") numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns if not numerical_cols.any(): st.warning("No numerical columns available for outlier detection.") return st.markdown("#### Outlier Detection Summary") outlier_summary = [] for col in numerical_cols: Q1, Q3 = df[col].quantile([0.25, 0.75]) IQR = Q3 - Q1 lower_bound = Q1 - 1.5 * IQR upper_bound = Q3 + 1.5 * IQR outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)] outlier_count = len(outliers) outlier_percentage = round((outlier_count / len(df) * 100), 2) outlier_summary.append({ "Column": col, "Outliers": outlier_count, "Outlier Percentage (%)": outlier_percentage }) outlier_df = pd.DataFrame(outlier_summary).sort_values(by="Outliers", ascending=False) st.dataframe(outlier_df) st.markdown("#### Outlier Visualization") selected_col = st.selectbox("Select a column to visualize outliers", numerical_cols) if selected_col: Q1, Q3 = df[selected_col].quantile([0.25, 0.75]) IQR = Q3 - Q1 lower_bound = Q1 - 1.5 * IQR upper_bound = Q3 + 1.5 * IQR fig, ax = plt.subplots() sns.boxplot(data=df, x=selected_col, ax=ax) ax.axhline(lower_bound, color='red', linestyle='--', label='Lower Bound') ax.axhline(upper_bound, color='blue', linestyle='--', label='Upper Bound') ax.legend() st.pyplot(fig) plt.close(fig) # Data Encoder def encoder(df): """ Encodes categorical columns in the dataset using user-selected methods (Label Encoding, One-Hot Encoding, or Frequency Encoding). Provides control over column selection, handles missing values, and displays encoding details. Args: df (pd.DataFrame): Input dataset to encode. Returns: pd.DataFrame: Encoded dataset. """ st.subheader("Encode Data") # Initialize session state for encoded DataFrame if 'encoded_df' not in st.session_state: st.session_state.encoded_df = df.copy() # Identify categorical columns categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist() if not categorical_cols: st.warning("No categorical columns ('object' or 'category') found in the dataset.") return df # Display original categorical columns st.markdown("### Categorical Columns Detected") st.write(f"Found {len(categorical_cols)} categorical columns: {', '.join(categorical_cols)}") for col in categorical_cols: st.write(f"- **{col}**: {df[col].nunique()} unique values, " f"{df[col].isna().sum()} missing ({df[col].isna().sum() / len(df) * 100:.2f}%)") # User configuration st.markdown("### Encoding Configuration") encoding_methods = { "Label Encoding": "Assigns integers to categories (best for ordinal data).", "One-Hot Encoding": "Creates binary columns for each category (best for non-ordinal data, avoid high cardinality).", "Frequency Encoding": "Replaces categories with their frequency (useful for high-cardinality columns)." } # Select columns to encode cols_to_encode = st.multiselect("Select Columns to Encode", categorical_cols, default=categorical_cols, help="Choose which categorical columns to encode. Unselected columns remain unchanged.") if not cols_to_encode: st.warning("Please select at least one column to encode.") return st.session_state.encoded_df # Missing value handling missing_strategy = st.selectbox("Handle Missing Values", ["Keep as NaN", "Impute with Mode", "Impute with Custom Value"], help="Choose how to handle missing values before encoding.") custom_value = None if missing_strategy == "Impute with Custom Value": custom_value = st.text_input("Enter Custom Value for Missing Entries", value="Unknown") # Apply missing value handling encoded_df = st.session_state.encoded_df.copy() for col in cols_to_encode: if missing_strategy == "Impute with Mode": mode_val = df[col].mode()[0] if not df[col].mode().empty else "Unknown" encoded_df[col] = df[col].fillna(mode_val) elif missing_strategy == "Impute with Custom Value": encoded_df[col] = df[col].fillna(custom_value) # Encoding method selection per column st.markdown("### Assign Encoding Methods") encoding_assignments = {} for col in cols_to_encode: default_method = "One-Hot Encoding" if df[col].nunique() <= 10 else "Frequency Encoding" encoding_assignments[col] = st.selectbox( f"Encoding Method for {col}", list(encoding_methods.keys()), index=list(encoding_methods.keys()).index(default_method), help=f"{encoding_methods[default_method]} Unique values: {df[col].nunique()}" ) # Apply encoding if st.button("Apply Encoding"): try: for col, method in encoding_assignments.items(): if method == "Label Encoding": le = LabelEncoder() # Convert to string to handle mixed types and NaNs encoded_df[col] = le.fit_transform(encoded_df[col].astype(str)) st.session_state[f"label_encoder_{col}"] = le # Store encoder for reference st.write(f"**{col}**: Label Encoded. Classes: {list(le.classes_)}") elif method == "One-Hot Encoding": if df[col].nunique() > 50: st.warning(f"**{col}** has {df[col].nunique()} unique values. One-Hot Encoding may create many columns.") # Drop NaN for one-hot encoding, reintroduce after mask = encoded_df[col].notna() ohe_df = pd.get_dummies(encoded_df.loc[mask, col], prefix=col, drop_first=True) encoded_df = pd.concat([encoded_df.drop(columns=[col]), ohe_df], axis=1) encoded_df.loc[~mask, ohe_df.columns] = np.nan st.write(f"**{col}**: One-Hot Encoded. Created {len(ohe_df.columns)} new columns.") elif method == "Frequency Encoding": freq_map = df[col].value_counts(normalize=True).to_dict() encoded_df[col] = df[col].map(freq_map) st.write(f"**{col}**: Frequency Encoded. Values mapped to proportions.") # Update session state st.session_state.encoded_df = encoded_df # Display results st.markdown("### Encoded Dataset Preview") st.dataframe(encoded_df.head()) # Data quality check new_cols = len(encoded_df.columns) - len(df.columns) if new_cols > 0: st.info(f"Encoding added {new_cols} new columns.") if encoded_df.isna().sum().sum() > 0: st.warning(f"Encoded dataset still has {encoded_df.isna().sum().sum()} missing values.") # Download option csv_bytes = encoded_df.to_csv(index=False).encode() st.download_button( label="Download Encoded Dataset", data=csv_bytes, file_name="encoded_dataset.csv", mime="text/csv", key="download_encoded" ) except Exception as e: st.error(f"Error during encoding: {str(e)}") return df # Preview current encoded state else: st.markdown("### Current Dataset Preview") st.dataframe(st.session_state.encoded_df.head()) return st.session_state.encoded_df # Data Transformer part eta def data_transformer(df): st.subheader("Data Transformer") transformed_df = df.copy() # Placeholder for future transformations st.write("Transformed Dataset:", transformed_df.head()) return transformed_df # Data Analysis def data_analysis(df): """ Performs an in-depth analysis of the dataset, including numerical and categorical summaries, interactive visualizations, data quality checks, and column-specific exploration. Args: df (pd.DataFrame): Input dataset to analyze. """ st.subheader("Data Analysis") # Initialize tabs for different analysis types tab1, tab2, tab3, tab4 = st.tabs(["Summary Statistics", "Visual Exploration", "Data Quality", "Column Deep Dive"]) with tab1: st.markdown("### Summary Statistics") # Numerical Columns numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns if numerical_cols.size > 0: st.markdown("#### Numerical Columns") numerical_summary = df[numerical_cols].describe().T.round(2) numerical_summary['Skewness'] = df[numerical_cols].skew().round(2) numerical_summary['Kurtosis'] = df[numerical_cols].kurt().round(2) numerical_summary['Missing %'] = (df[numerical_cols].isna().sum() / len(df) * 100).round(2) st.dataframe(numerical_summary.style.highlight_max(axis=0, color='lightgreen')) st.write("*Skewness > 1 or < -1 indicates high skew. Kurtosis > 3 indicates heavy tails.*") else: st.info("No numerical columns found.") # Categorical Columns categorical_cols = df.select_dtypes(include=['object', 'category']).columns if categorical_cols.size > 0: st.markdown("#### Categorical Columns") cat_summary = pd.DataFrame({ "Column": categorical_cols, "Unique Values": [df[col].nunique() for col in categorical_cols], "Most Frequent": [df[col].mode()[0] if not df[col].mode().empty else np.nan for col in categorical_cols], "Missing %": [(df[col].isna().sum() / len(df) * 100).round(2) for col in categorical_cols] }) st.dataframe(cat_summary) else: st.info("No categorical columns found.") with tab2: st.markdown("### Visual Exploration") viz_type = st.selectbox("Select Visualization Type", ["Distribution (Numerical)", "Count Plot (Categorical)", "Correlation Heatmap", "Pair Plot"], key="data_analysis_viz") if viz_type == "Distribution (Numerical)" and numerical_cols.size > 0: col = st.selectbox("Select Column", numerical_cols, key="num_dist_col") fig, ax = plt.subplots() sns.histplot(data=df, x=col, kde=True, bins='auto', ax=ax) ax.set_title(f"Distribution of {col}") st.pyplot(fig) download_image(fig, f"dist_{col}") plt.close(fig) elif viz_type == "Count Plot (Categorical)" and categorical_cols.size > 0: col = st.selectbox("Select Column", categorical_cols, key="cat_count_col") fig, ax = plt.subplots() sns.countplot(data=df, x=col, ax=ax) plt.xticks(rotation=45, ha='right') ax.set_title(f"Count Plot of {col}") st.pyplot(fig) download_image(fig, f"count_{col}") plt.close(fig) elif viz_type == "Correlation Heatmap" and numerical_cols.size > 1: fig, ax = plt.subplots(figsize=(10, 8)) sns.heatmap(df[numerical_cols].corr(), annot=True, cmap="coolwarm", vmin=-1, vmax=1, fmt='.2f', ax=ax) ax.set_title("Correlation Heatmap") st.pyplot(fig) download_image(fig, "corr_heatmap") plt.close(fig) elif viz_type == "Pair Plot" and numerical_cols.size > 0: selected_cols = st.multiselect("Select Columns (max 4)", numerical_cols, max_selections=4, key="pair_cols") if len(selected_cols) >= 2: fig = sns.pairplot(df[selected_cols].dropna()) st.pyplot(fig) img_bytes = BytesIO() fig.savefig(img_bytes, format='png', bbox_inches='tight') img_bytes.seek(0) st.download_button(label="Download Pair Plot", data=img_bytes, file_name=f"pairplot_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png", mime="image/png", key=f"pairplot_{datetime.now().strftime('%H%M%S')}") plt.close() with tab3: st.markdown("### Data Quality Checks") # Missing Values missing_total = df.isna().sum().sum() if missing_total > 0: st.warning(f"**Missing Values**: {missing_total} across {df.isna().any().sum()} columns.") missing_df = pd.DataFrame({ "Column": df.columns, "Missing Count": df.isna().sum(), "Missing %": (df.isna().sum() / len(df) * 100).round(2) }) missing_df = missing_df[missing_df["Missing Count"] > 0] st.dataframe(missing_df) else: st.success("No missing values detected.") # Duplicates duplicates = df.duplicated().sum() if duplicates > 0: st.warning(f"**Duplicates**: {duplicates} duplicate rows ({duplicates / len(df) * 100:.2f}%).") else: st.success("No duplicate rows detected.") # Outliers (Numerical) if numerical_cols.size > 0: outlier_summary = [] for col in numerical_cols: Q1, Q3 = df[col].quantile([0.25, 0.75]) IQR = Q3 - Q1 outliers = ((df[col] < (Q1 - 1.5 * IQR)) | (df[col] > (Q3 + 1.5 * IQR))).sum() if outliers > 0: outlier_summary.append({"Column": col, "Outlier Count": outliers, "Outlier %": (outliers / len(df) * 100).round(2)}) if outlier_summary: st.warning("**Outliers Detected**:") st.dataframe(pd.DataFrame(outlier_summary)) else: st.success("No outliers detected in numerical columns.") with tab4: st.markdown("### Column Deep Dive") selected_col = st.selectbox("Select Column for Detailed Analysis", df.columns, key="deep_dive_col") st.write(f"**Column**: {selected_col}") st.write(f"**Data Type**: {df[selected_col].dtype}") st.write(f"**Missing Values**: {df[selected_col].isna().sum()} ({df[selected_col].isna().sum() / len(df) * 100:.2f}%)") st.write(f"**Unique Values**: {df[selected_col].nunique()} ({df[selected_col].nunique() / len(df) * 100:.2f}%)") if pd.api.types.is_numeric_dtype(df[selected_col]): st.write("**Summary Statistics**:") stats = df[selected_col].describe().round(2) stats['Skewness'] = df[selected_col].skew().round(2) stats['Kurtosis'] = df[selected_col].kurt().round(2) st.dataframe(stats) fig = px.histogram(df, x=selected_col, nbins=30, title=f"Distribution of {selected_col}") st.plotly_chart(fig) elif pd.api.types.is_object_dtype(df[selected_col]) or pd.api.types.is_categorical_dtype(df[selected_col]): st.write("**Top 5 Values**:") value_counts = df[selected_col].value_counts().head(5) st.dataframe(pd.DataFrame({ "Value": value_counts.index, "Count": value_counts.values, "% of Total": (value_counts.values / len(df) * 100).round(2) })) fig = px.bar(x=value_counts.index, y=value_counts.values, title=f"Top Values in {selected_col}") st.plotly_chart(fig) def download_image(fig, key_prefix): """ Utility function to download a Matplotlib figure as PNG. """ img_bytes = BytesIO() fig.savefig(img_bytes, format='png', bbox_inches='tight') img_bytes.seek(0) st.download_button(label="Download Image", data=img_bytes, file_name=f"{key_prefix}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png", mime="image/png", key=f"download_{key_prefix}_{datetime.now().strftime('%H%M%S')}") # Feature Importance Analysis def feature_importance_analyzer(df): st.subheader("Feature Importance Analyzer") target_column = st.selectbox("Select Target Column", df.columns) feature_columns = [col for col in df.columns if col != target_column] if not feature_columns: st.warning("No features available.") return X = pd.get_dummies(df[feature_columns], drop_first=True) y = df[target_column] if y.dtype in ['object', 'category']: y = LabelEncoder().fit_transform(y) try: model = RandomForestClassifier(random_state=42) if y.nunique() <= 10 else RandomForestRegressor(random_state=42) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) model.fit(X_train, y_train) importance_df = pd.DataFrame({"Feature": X.columns, "Importance": model.feature_importances_}).sort_values(by="Importance", ascending=False) st.write("Feature Importances:", importance_df) fig, ax = plt.subplots(figsize=(10, 6)) sns.barplot(data=importance_df, x="Importance", y="Feature", palette="viridis", ax=ax) st.pyplot(fig) download_image(fig, "feature_importance") plt.close(fig) except Exception as e: st.error(f"Error analyzing features: {str(e)}") #Best Parameter Selector def best_parameter_selector(df): st.subheader("Best Parameter Selector") task_type = st.selectbox("Select Task Type", ["Classification", "Regression"]) target_column = st.selectbox("Select Target Column", df.columns) feature_columns = [col for col in df.columns if col != target_column] if not feature_columns: st.warning("No features available.") return X = pd.get_dummies(df[feature_columns], drop_first=True) y = df[target_column] if task_type == "Classification" and y.dtype in ['object', 'category']: y = LabelEncoder().fit_transform(y) model_options = { "Classification": { "Logistic Regression": (LogisticRegression, {"C": [0.01, 0.1, 1], "max_iter": [100, 200]}), "Random Forest": (RandomForestClassifier, {"n_estimators": [50, 100], "max_depth": [None, 10]}), "SVM": (SVC, {"C": [0.1, 1], "kernel": ["rbf", "linear"]}) }, "Regression": { "Linear Regression": (LinearRegression, {}), "Random Forest": (RandomForestRegressor, {"n_estimators": [50, 100], "max_depth": [None, 10]}), "SVR": (SVR, {"C": [0.1, 1], "epsilon": [0.1, 0.2]}) } } model_name = st.selectbox("Select Model", list(model_options[task_type].keys())) model_class, param_grid = model_options[task_type][model_name] model = model_class(random_state=42) if "random_state" in model_class.__init__.__code__.co_varnames else model_class() for param, values in param_grid.items(): new_values = st.text_input(f"Values for {param} (comma-separated)", ",".join(map(str, values)) if values else "") if new_values: param_grid[param] = [float(x) if '.' in x else int(x) for x in new_values.split(',')] scoring = st.selectbox("Select Scoring Metric", ["accuracy", "f1"] if task_type == "Classification" else ["r2", "neg_mean_squared_error"]) try: if param_grid: with st.spinner("Performing GridSearchCV..."): grid_search = GridSearchCV(model, param_grid, cv=3, scoring=scoring, n_jobs=-1) grid_search.fit(X, y) st.write("Best Parameters:", grid_search.best_params_) st.write("Best Score:", grid_search.best_score_) else: model.fit(X, y) st.write("Model trained with default parameters. Score:", model.score(X, y)) except Exception as e: st.error(f"Parameter selection error: {str(e)}") # Select ML Models import streamlit as st import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns import plotly.express as px from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor, AdaBoostClassifier, AdaBoostRegressor, ExtraTreesClassifier, ExtraTreesRegressor from sklearn.svm import SVC, SVR from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor from sklearn.naive_bayes import GaussianNB from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, SpectralClustering, OPTICS from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error, silhouette_score, davies_bouldin_score, calinski_harabasz_score, adjusted_rand_score, v_measure_score) from sklearn.compose import ColumnTransformer from sklearn.pipeline import Pipeline from sklearn.impute import SimpleImputer from io import BytesIO import pickle import warnings warnings.filterwarnings("ignore", category=UserWarning) # Suppress sklearn warnings # Time series imports try: from statsmodels.tsa.arima.model import ARIMA from statsmodels.tsa.holtwinters import ExponentialSmoothing from prophet import Prophet HAS_STATSMODELS = True HAS_PROPHET = True except ImportError: HAS_STATSMODELS = False HAS_PROPHET = False def select_ml_models(df): """ Builds an end-to-end ML workflow for classification, regression, clustering, or time series forecasting. Includes preprocessing, model training, evaluation, hyperparameter tuning, and model saving. Args: df (pd.DataFrame): Input dataset as a pandas DataFrame. """ st.subheader("Machine Learning Workflow") # Select ML task analysis_type = st.selectbox("Select Machine Learning Task", ["Classification", "Regression", "Clustering", "Time Series Forecasting"], help="Choose the type of ML task to perform.") # Preprocessing function def preprocess_data(df, target_col=None, task_type=None): """ Preprocesses the dataset based on the ML task. Args: df (pd.DataFrame): Input dataset. target_col (str): Target column name (None for clustering). task_type (str): Type of ML task. Returns: tuple: Preprocessed features (X), target (y), and preprocessor (if applicable). """ if target_col: X = df.drop(columns=[target_col]) y = df[target_col] else: X = df y = None # Identify numerical and categorical columns numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist() categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist() # Define preprocessing pipeline numerical_transformer = Pipeline(steps=[ ('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler()) ]) categorical_transformer = Pipeline(steps=[ ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')), ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)) ]) preprocessor = ColumnTransformer( transformers=[ ('num', numerical_transformer, numerical_cols), ('cat', categorical_transformer, categorical_cols) ]) if task_type in ["Classification", "Regression"]: # Fit and transform features X_processed = preprocessor.fit_transform(X) # Handle target for classification if task_type == "Classification" and y.dtype in ['object', 'category']: le = LabelEncoder() y = le.fit_transform(y) return X_processed, y, preprocessor, le return X_processed, y, preprocessor, None elif task_type == "Clustering": X_processed = preprocessor.fit_transform(X) return X_processed, None, preprocessor, None elif task_type == "Time Series Forecasting": # Time series requires minimal preprocessing here return X, y, None, None # Model evaluation function def evaluate_model(model, X_test, y_test, task_type, y_pred=None): """ Evaluates the model using task-specific metrics. Args: model: Trained model. X_test: Test features. y_test: Test target. task_type: Type of ML task. y_pred: Predicted values (optional, computed if None). Returns: dict: Evaluation metrics. """ if y_pred is None: y_pred = model.predict(X_test) if task_type != "Time Series Forecasting" else model.forecast(len(y_test)) metrics = {} if task_type == "Classification": metrics.update({ "Accuracy": accuracy_score(y_test, y_pred), "Precision": precision_score(y_test, y_pred, average='weighted', zero_division=0), "Recall": recall_score(y_test, y_pred, average='weighted', zero_division=0), "F1 Score": f1_score(y_test, y_pred, average='weighted', zero_division=0), "ROC AUC": roc_auc_score(y_test, model.predict_proba(X_test)[:, 1], multi_class='ovr') if hasattr(model, "predict_proba") else np.nan }) # Confusion matrix cm = confusion_matrix(y_test, y_pred) fig, ax = plt.subplots() sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax) ax.set_title("Confusion Matrix") ax.set_xlabel("Predicted") ax.set_ylabel("True") st.pyplot(fig) plt.close(fig) elif task_type == "Regression": metrics.update({ "MAE": mean_absolute_error(y_test, y_pred), "MSE": mean_squared_error(y_test, y_pred), "RMSE": np.sqrt(mean_squared_error(y_test, y_pred)), "R²": r2_score(y_test, y_pred), "MAPE": mean_absolute_percentage_error(y_test, y_pred) * 100 }) # Scatter plot of predictions fig = px.scatter(x=y_test, y=y_pred, labels={'x': 'True Values', 'y': 'Predicted Values'}, title="True vs Predicted Values") fig.add_scatter(x=y_test, y=y_test, mode='lines', name='Ideal') st.plotly_chart(fig) elif task_type == "Clustering": metrics.update({ "Silhouette Score": silhouette_score(X_test, y_pred) if len(np.unique(y_pred)) > 1 else np.nan, "Davies-Bouldin Index": davies_bouldin_score(X_test, y_pred) if len(np.unique(y_pred)) > 1 else np.nan, "Calinski-Harabasz Score": calinski_harabasz_score(X_test, y_pred) if len(np.unique(y_pred)) > 1 else np.nan }) # Visualize clusters (if 2D or reducible) if X_test.shape[1] == 2: fig = px.scatter(x=X_test[:, 0], y=X_test[:, 1], color=y_pred.astype(str), title="Cluster Visualization", labels={'x': 'Feature 1', 'y': 'Feature 2'}) st.plotly_chart(fig) elif task_type == "Time Series Forecasting": metrics.update({ "MAE": mean_absolute_error(y_test, y_pred), "MSE": mean_squared_error(y_test, y_pred), "RMSE": np.sqrt(mean_squared_error(y_test, y_pred)), "MAPE": mean_absolute_percentage_error(y_test, y_pred) * 100 }) return metrics # Hyperparameter tuning function def tune_model(model, X_train, y_train, task_type, model_name): """ Performs hyperparameter tuning using RandomizedSearchCV. Args: model: Model to tune. X_train: Training features. y_train: Training target. task_type: Type of ML task. model_name: Name of the model. Returns: tuple: Best model and parameters. """ param_grids = { "Logistic Regression": {"C": np.logspace(-3, 3, 10), "penalty": ['l2'], "max_iter": [1000]}, "Random Forest Classifier": {"n_estimators": [50, 100, 200], "max_depth": [None, 10, 20], "min_samples_split": [2, 5]}, "SVM Classifier": {"C": [0.1, 1, 10], "kernel": ['rbf', 'linear']}, "KNN Classifier": {"n_neighbors": [3, 5, 7, 9], "weights": ['uniform', 'distance']}, "Gradient Boosting Classifier": {"n_estimators": [50, 100], "learning_rate": [0.01, 0.1], "max_depth": [3, 5]}, "Naive Bayes": {}, # No tuning for Naive Bayes "AdaBoost Classifier": {"n_estimators": [50, 100], "learning_rate": [0.01, 0.1]}, "Extra Trees Classifier": {"n_estimators": [50, 100], "max_depth": [None, 10, 20]}, "Linear Regression": {}, # No tuning for basic Linear Regression "Ridge Regression": {"alpha": [0.1, 1, 10, 100]}, "Lasso Regression": {"alpha": [0.1, 1, 10, 100]}, "Random Forest Regressor": {"n_estimators": [50, 100, 200], "max_depth": [None, 10, 20]}, "SVR": {"C": [0.1, 1, 10], "epsilon": [0.1, 0.2, 0.5]}, "KNN Regressor": {"n_neighbors": [3, 5, 7, 9], "weights": ['uniform', 'distance']}, "Gradient Boosting Regressor": {"n_estimators": [50, 100], "learning_rate": [0.01, 0.1], "max_depth": [3, 5]}, "AdaBoost Regressor": {"n_estimators": [50, 100], "learning_rate": [0.01, 0.1]}, "Extra Trees Regressor": {"n_estimators": [50, 100], "max_depth": [None, 10, 20]}, "K-Means": {"n_clusters": [2, 3, 4, 5, 6, 7, 8]}, "DBSCAN": {"eps": [0.1, 0.5, 1.0], "min_samples": [3, 5, 10]}, "Agglomerative Clustering": {"n_clusters": [2, 3, 4, 5, 6, 7, 8]}, "Spectral Clustering": {"n_clusters": [2, 3, 4, 5, 6, 7, 8]}, "OPTICS": {"min_samples": [3, 5, 10], "xi": [0.05, 0.1]}, "ARIMA": {"order": [(1,1,1), (2,1,1), (1,1,2)]}, "Exponential Smoothing": {"trend": ["add", None], "seasonal": ["add", None]}, "Prophet": {"changepoint_prior_scale": [0.01, 0.05, 0.1], "seasonality_prior_scale": [5, 10, 15]} } if model_name in param_grids and param_grids[model_name]: search = RandomizedSearchCV(model, param_distributions=param_grids[model_name], n_iter=10, cv=3, scoring='accuracy' if task_type == "Classification" else 'r2', n_jobs=-1, random_state=42) search.fit(X_train, y_train) return search.best_estimator_, search.best_params_ return model, {} # Model saving function def save_model(model, model_name): """ Serializes and offers the model for download. Args: model: Trained model. model_name: Name of the model. Returns: BytesIO: Serialized model file. """ model_file = BytesIO() pickle.dump(model, model_file) model_file.seek(0) return model_file # Classification and Regression if analysis_type in ["Classification", "Regression"]: st.markdown("### Configure Model") target_col = st.selectbox("Select Target Variable", df.columns, key="target_col") feature_cols = st.multiselect("Select Feature Columns", [col for col in df.columns if col != target_col], key="feature_cols") if not feature_cols: st.warning("Please select at least one feature column.") return # Validate target if analysis_type == "Classification": if pd.api.types.is_numeric_dtype(df[target_col]) and df[target_col].nunique() > len(df) // 10: st.error("Target appears continuous. Consider binning or switching to Regression.") return elif analysis_type == "Regression": if not pd.api.types.is_numeric_dtype(df[target_col]): st.error("Target must be numeric for regression.") return # Preprocess data try: X, y, preprocessor, le = preprocess_data(df[feature_cols + [target_col]], target_col, analysis_type) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) except Exception as e: st.error(f"Preprocessing error: {str(e)}") return # Model options model_options = { "Classification": { "Logistic Regression": LogisticRegression(random_state=42), "Random Forest Classifier": RandomForestClassifier(random_state=42), "SVM Classifier": SVC(random_state=42, probability=True), "KNN Classifier": KNeighborsClassifier(), "Gradient Boosting Classifier": GradientBoostingClassifier(random_state=42), "Naive Bayes": GaussianNB(), "AdaBoost Classifier": AdaBoostClassifier(random_state=42), "Extra Trees Classifier": ExtraTreesClassifier(random_state=42) }, "Regression": { "Linear Regression": LinearRegression(), "Ridge Regression": Ridge(random_state=42), "Lasso Regression": Lasso(random_state=42), "Random Forest Regressor": RandomForestRegressor(random_state=42), "SVR": SVR(), "KNN Regressor": KNeighborsRegressor(), "Gradient Boosting Regressor": GradientBoostingRegressor(random_state=42), "AdaBoost Regressor": AdaBoostRegressor(random_state=42), "Extra Trees Regressor": ExtraTreesRegressor(random_state=42) } }[analysis_type] selected_model_name = st.selectbox("Select Model", list(model_options.keys()), key="model_select") model = model_options[selected_model_name] # Train model if st.button("Train Model", key="train_button"): with st.spinner("Training model..."): try: model.fit(X_train, y_train) y_pred = model.predict(X_test) metrics = evaluate_model(model, X_test, y_test, analysis_type, y_pred) st.markdown("### Model Performance") st.dataframe(pd.DataFrame(metrics.items(), columns=["Metric", "Value"]).round(4)) # Cross-validation cv_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy' if analysis_type == "Classification" else 'r2') st.write(f"**Cross-Validation Scores** (5-fold): Mean = {cv_scores.mean():.4f}, Std = {cv_scores.std():.4f}") # Store model in session state st.session_state['trained_model'] = model st.session_state['model_name'] = selected_model_name except Exception as e: st.error(f"Training error: {str(e)}") return # Hyperparameter tuning if st.button("Perform Hyperparameter Tuning", key="tune_button"): with st.spinner("Tuning hyperparameters..."): try: tuned_model, best_params = tune_model(model, X_train, y_train, analysis_type, selected_model_name) tuned_model.fit(X_train, y_train) y_pred = tuned_model.predict(X_test) metrics = evaluate_model(tuned_model, X_test, y_test, analysis_type, y_pred) st.markdown("### Tuned Model Performance") st.dataframe(pd.DataFrame(metrics.items(), columns=["Metric", "Value"]).round(4)) st.write("**Best Hyperparameters**:", best_params) # Update stored model st.session_state['trained_model'] = tuned_model st.session_state['model_name'] = selected_model_name + "_Tuned" except Exception as e: st.error(f"Tuning error: {str(e)}") # Save model if 'trained_model' in st.session_state and st.button("Save The Model", key="save_button"): model_file = save_model(st.session_state['trained_model'], st.session_state['model_name']) st.download_button( label="Download Model", data=model_file, file_name=f"{st.session_state['model_name'].replace(' ', '_').lower()}_model.pkl", mime="application/octet-stream", key="download_model" ) # Clustering elif analysis_type == "Clustering": st.markdown("### Configure Clustering") feature_cols = st.multiselect("Select Features for Clustering", df.columns, key="cluster_cols") if not feature_cols: st.warning("Please select at least one feature column.") return # Preprocess data try: X, _, preprocessor, _ = preprocess_data(df[feature_cols], task_type="Clustering") except Exception as e: st.error(f"Preprocessing error: {str(e)}") return n_clusters = st.slider("Number of Clusters (for applicable algorithms)", 2, 10, 3, key="n_clusters") clustering_models = { "K-Means": KMeans(n_clusters=n_clusters, random_state=42), "DBSCAN": DBSCAN(eps=0.5, min_samples=5), "Agglomerative Clustering": AgglomerativeClustering(n_clusters=n_clusters), "Spectral Clustering": SpectralClustering(n_clusters=n_clusters, random_state=42), "OPTICS": OPTICS(min_samples=5) } selected_model_name = st.selectbox("Select Clustering Algorithm", list(clustering_models.keys()), key="cluster_model_select") model = clustering_models[selected_model_name] # Train model if st.button("Perform Clustering", key="cluster_button"): with st.spinner("Performing clustering..."): try: clusters = model.fit_predict(X) df_with_clusters = df.copy() df_with_clusters['Cluster'] = clusters st.markdown("### Clustered Data Sample") st.dataframe(df_with_clusters.head()) # Evaluate clustering metrics = evaluate_model(model, X, clusters, "Clustering", clusters) st.markdown("### Clustering Metrics") st.dataframe(pd.DataFrame(metrics.items(), columns=["Metric", "Value"]).round(4)) # Store model st.session_state['trained_model'] = model st.session_state['model_name'] = selected_model_name except Exception as e: st.error(f"Clustering error: {str(e)}") return # Hyperparameter tuning if st.button("Perform Hyperparameter Tuning", key="cluster_tune_button"): with st.spinner("Tuning hyperparameters..."): try: tuned_model, best_params = tune_model(model, X, clusters, "Clustering", selected_model_name) clusters = tuned_model.fit_predict(X) df_with_clusters = df.copy() df_with_clusters['Cluster'] = clusters st.markdown("### Tuned Clustered Data Sample") st.dataframe(df_with_clusters.head()) metrics = evaluate_model(tuned_model, X, clusters, "Clustering", clusters) st.markdown("### Tuned Clustering Metrics") st.dataframe(pd.DataFrame(metrics.items(), columns=["Metric", "Value"]).round(4)) st.write("**Best Hyperparameters**:", best_params) # Update stored model st.session_state['trained_model'] = tuned_model st.session_state['model_name'] = selected_model_name + "_Tuned" except Exception as e: st.error(f"Tuning error: {str(e)}") # Save model if 'trained_model' in st.session_state and st.button("Save The Model", key="cluster_save_button"): model_file = save_model(st.session_state['trained_model'], st.session_state['model_name']) st.download_button( label="Download Model", data=model_file, file_name=f"{st.session_state['model_name'].replace(' ', '_').lower()}_model.pkl", mime="application/octet-stream", key="download_cluster_model" ) # Time Series Forecasting elif analysis_type == "Time Series Forecasting": if not (HAS_STATSMODELS or HAS_PROPHET): st.error("Please install statsmodels or prophet: `pip install statsmodels fbprophet`") return st.markdown("### Configure Time Series Forecasting") datetime_cols = df.select_dtypes(include=['datetime64']).columns if datetime_cols.empty: st.error("No datetime columns found. Please ensure a datetime column exists.") return date_col = st.selectbox("Select Date Column", datetime_cols, key="date_col") value_col = st.selectbox("Select Value Column", df.select_dtypes(include=['float64', 'int64']).columns, key="value_col") # Prepare data try: ts_df = df[[date_col, value_col]].sort_values(date_col).dropna() ts_df[date_col] = pd.to_datetime(ts_df[date_col]) except Exception as e: st.error(f"Data preparation error: {str(e)}") return # Time-based split train_size = st.slider("Training Data Proportion (%)", 50, 95, 80, key="train_size") train_size = int(len(ts_df) * (train_size / 100)) train, test = ts_df[:train_size], ts_df[train_size:] # Model options forecast_models = {} if HAS_STATSMODELS: forecast_models.update({ "ARIMA": lambda data: ARIMA(data[value_col], order=(1,1,1)).fit(), "Exponential Smoothing": lambda data: ExponentialSmoothing( data[value_col], trend='add', seasonal='add', seasonal_periods=12 ).fit() }) if HAS_PROPHET: forecast_models["Prophet"] = lambda data: Prophet().fit( data.rename(columns={date_col: 'ds', value_col: 'y'}) ) selected_model_name = st.selectbox("Select Forecasting Model", list(forecast_models.keys()), key="ts_model_select") # Train model if st.button("Train Model", key="ts_train_button"): with st.spinner("Training time series model..."): try: if selected_model_name == "Prophet": model = forecast_models[selected_model_name](train) future = model.make_future_dataframe(periods=len(test)) forecast = model.predict(future) y_pred = forecast['yhat'][-len(test):].values y_test = test[value_col].values else: model = forecast_models[selected_model_name](train) y_pred = model.forecast(steps=len(test)) y_test = test[value_col].values # Evaluate metrics = evaluate_model(model, test, y_test, "Time Series Forecasting", y_pred) st.markdown("### Model Performance") st.dataframe(pd.DataFrame(metrics.items(), columns=["Metric", "Value"]).round(4)) # Plot forecast fig, ax = plt.subplots(figsize=(10, 5)) ax.plot(train[date_col], train[value_col], label="Train") ax.plot(test[date_col], test[value_col], label="Test") ax.plot(test[date_col], y_pred, label="Forecast") ax.set_title(f"{selected_model_name} Forecast") ax.legend() st.pyplot(fig) plt.close(fig) # Store model st.session_state['trained_model'] = model st.session_state['model_name'] = selected_model_name except Exception as e: st.error(f"Training error: {str(e)}") return # Hyperparameter tuning if st.button("Perform Hyperparameter Tuning", key="ts_tune_button"): with st.spinner("Tuning hyperparameters..."): try: if selected_model_name == "Prophet": param_grid = { "changepoint_prior_scale": [0.01, 0.05, 0.1], "seasonality_prior_scale": [5, 10, 15] } best_score = float('inf') best_params = {} best_model = None for cps in param_grid["changepoint_prior_scale"]: for sps in param_grid["seasonality_prior_scale"]: model = Prophet(changepoint_prior_scale=cps, seasonality_prior_scale=sps) model.fit(train.rename(columns={date_col: 'ds', value_col: 'y'})) future = model.make_future_dataframe(periods=len(test)) forecast = model.predict(future) score = mean_squared_error(test[value_col], forecast['yhat'][-len(test):]) if score < best_score: best_score = score best_params = {"changepoint_prior_scale": cps, "seasonality_prior_scale": sps} best_model = model model = best_model else: model, best_params = tune_model(model, train[value_col], None, "Time Series Forecasting", selected_model_name) # Re-evaluate if selected_model_name == "Prophet": future = model.make_future_dataframe(periods=len(test)) forecast = model.predict(future) y_pred = forecast['yhat'][-len(test):].values y_test = test[value_col].values else: model = forecast_models[selected_model_name](train) y_pred = model.forecast(steps=len(test)) y_test = test[value_col].values metrics = evaluate_model(model, test, y_test, "Time Series Forecasting", y_pred) st.markdown("### Tuned Model Performance") st.dataframe(pd.DataFrame(metrics.items(), columns=["Metric", "Value"]).round(4)) st.write("**Best Hyperparameters**:", best_params) # Plot tuned forecast fig, ax = plt.subplots(figsize=(10, 5)) ax.plot(train[date_col], train[value_col], label="Train") ax.plot(test[date_col], test[value_col], label="Test") ax.plot(test[date_col], y_pred, label="Tuned Forecast") ax.set_title(f"Tuned {selected_model_name} Forecast") ax.legend() st.pyplot(fig) plt.close(fig) # Update stored model st.session_state['trained_model'] = model st.session_state['model_name'] = selected_model_name + "_Tuned" except Exception as e: st.error(f"Tuning error: {str(e)}") # Save model if 'trained_model' in st.session_state and st.button("Save The Model", key="ts_save_button"): model_file = save_model(st.session_state['trained_model'], st.session_state['model_name']) st.download_button( label="Download Model", data=model_file, file_name=f"{st.session_state['model_name'].replace(' ', '_').lower()}_model.pkl", mime="application/octet-stream", key="download_ts_model" ) # Clear Modified Dataset def clear_modified_dataset(): st.subheader("Clear Modified Dataset") st.session_state.pop('uploaded_df', None) st.write("Dataset cleared.") # Chat with the dataset def chat_with_dataset(df): st.subheader("Chat with Your Dataset") st.write("Ask questions about your dataset. For example, 'What is the average value of column X?' or 'Show me the top 5 rows.'") user_query = st.text_area("Enter your query:", height=100) if st.button("Ask"): if not user_query.strip(): st.warning("Please enter a query.") return try: chat_completion = client.chat.completions.create( messages=[ { "role": "system", "content": ( "You are an expert data analyst. Answer the user's questions about the provided pandas DataFrame. " "Use Python pandas to analyze the data and provide concise answers. " "If the user asks for code, generate Python code snippets using pandas to perform the requested operation. " "Do not include explanations unless explicitly requested." ), }, {"role": "user", "content": f"The dataset is:\n{df.head(5).to_string()}\n\n{user_query}"}, ], model="llama-3.3-70b-versatile", ) response = chat_completion.choices[0].message.content.strip() st.write("Response:") st.code(response, language="python" if "def " in response or "import " in response else None) st.write("You can execute the generated code below:") if st.button("Execute Generated Code"): try: safe_globals = {"pd": pd, "plt": plt, "sns": sns, "df": df, "io": io, "np": np} safe_locals = {} exec(response, safe_globals, safe_locals) # Check for matplotlib or seaborn plots if "plt." in response or "sns." in response: st.pyplot(plt.gcf()) plt.clf() # Check for DataFrame outputs elif "pd.DataFrame" in response or "df" in response: output_df = safe_locals.get("df", None) if isinstance(output_df, pd.DataFrame): st.write("Generated DataFrame:") st.dataframe(output_df) else: st.write("Code executed successfully. Check the output above if applicable.") else: st.write("Code executed successfully. Check the output above if applicable.") except Exception as e: st.error(f"Error executing code: {str(e)}") except Exception as e: st.error(f"Error with Groq API: {str(e)}") # NLP Pipeline def nlp_pipeline_tab(): """Function for the NLP Pipeline tab.""" import streamlit as st import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns import plotly.express as px from sklearn.preprocessing import LabelEncoder from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV from sklearn.linear_model import LogisticRegression from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier from sklearn.svm import SVC from sklearn.naive_bayes import MultinomialNB from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix) from sklearn.feature_extraction.text import TfidfVectorizer from gensim import corpora from gensim.models import LdaModel import spacy import nltk from nltk.tokenize import word_tokenize from nltk.corpus import stopwords from nltk.stem import WordNetLemmatizer from io import BytesIO import pickle st.header("NLP Pipeline") st.markdown("Perform Natural Language Processing tasks like text classification, sentiment analysis, topic modeling, and named entity recognition.") # Access uploaded dataset from session state if 'uploaded_df' not in st.session_state: st.error("No dataset uploaded. Please upload a CSV file in the main DataGenie app.") return df = st.session_state['uploaded_df'] st.write("Dataset Preview:") st.dataframe(df.head()) # Select NLP task nlp_task = st.selectbox("Select NLP Task", ["Text Classification", "Sentiment Analysis", "Topic Modeling", "Named Entity Recognition"], help="Choose the specific NLP task to perform.") # Preprocessing function def preprocess_nlp_data(df, text_col, target_col=None, task_type=None): stop_words = set(stopwords.words('english')) lemmatizer = WordNetLemmatizer() def clean_text(text): if not isinstance(text, str): return "" tokens = word_tokenize(text.lower()) tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalnum()] tokens = [token for token in tokens if token not in stop_words] return " ".join(tokens) X_text = df[text_col].apply(clean_text) if task_type in ["Text Classification", "Sentiment Analysis"]: vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2)) X = vectorizer.fit_transform(X_text).toarray() y = df[target_col] if target_col else None if task_type == "Text Classification" and y.dtype in ['object', 'category']: le = LabelEncoder() y = le.fit_transform(y) return X, y, vectorizer, le return X, y, vectorizer, None elif task_type == "Topic Modeling": texts = [text.split() for text in X_text] dictionary = corpora.Dictionary(texts) corpus = [dictionary.doc2bow(text) for text in texts] return corpus, None, dictionary, None elif task_type == "Named Entity Recognition": return X_text, None, None, None # Evaluation function def evaluate_model(model, X_test, y_test, task_type, y_pred=None, vectorizer=None): metrics = {} if task_type in ["Text Classification", "Sentiment Analysis"]: if y_pred is None: y_pred = model.predict(X_test) metrics.update({ "Accuracy": accuracy_score(y_test, y_pred), "Precision": precision_score(y_test, y_pred, average='weighted', zero_division=0), "Recall": recall_score(y_test, y_pred, average='weighted', zero_division=0), "F1 Score": f1_score(y_test, y_pred, average='weighted', zero_division=0), "ROC AUC": roc_auc_score(y_test, model.predict_proba(X_test)[:, 1], multi_class='ovr') if hasattr(model, "predict_proba") else np.nan }) cm = confusion_matrix(y_test, y_pred) fig, ax = plt.subplots() sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax) ax.set_title("Confusion Matrix") ax.set_xlabel("Predicted") ax.set_ylabel("True") st.pyplot(fig) plt.close(fig) elif task_type == "Topic Modeling": metrics["Number of Topics"] = model.num_topics topics = model.print_topics(num_words=5) for topic_id, topic in topics: st.write(f"**Topic {topic_id}**: {topic}") return metrics # Hyperparameter tuning function def tune_model(model, X_train, y_train, task_type, model_name): param_grids = { "Logistic Regression": {"C": np.logspace(-3, 3, 10), "penalty": ['l2'], "max_iter": [1000]}, "Random Forest Classifier": {"n_estimators": [50, 100, 200], "max_depth": [None, 10, 20]}, "SVM Classifier": {"C": [0.1, 1, 10], "kernel": ['rbf', 'linear']}, "Gradient Boosting Classifier": {"n_estimators": [50, 100], "learning_rate": [0.01, 0.1]}, "Naive Bayes": {"alpha": [0.1, 0.5, 1.0]}, "LDA": {"num_topics": [5, 10, 15, 20]} } if model_name in param_grids and param_grids[model_name]: search = RandomizedSearchCV(model, param_distributions=param_grids[model_name], n_iter=10, cv=3, scoring='accuracy' if task_type in ["Text Classification", "Sentiment Analysis"] else None, n_jobs=-1, random_state=42) search.fit(X_train, y_train) return search.best_estimator_, search.best_params_ return model, {} # Model saving function def save_model(model, model_name): model_file = BytesIO() pickle.dump(model, model_file) model_file.seek(0) return model_file # Identify text columns text_cols = df.select_dtypes(include=['object']).columns.tolist() if not text_cols: st.error("No text columns found in the dataset.") return text_col = st.selectbox("Select Text Column", text_cols, key="nlp_text_col") # Text Classification and Sentiment Analysis if nlp_task in ["Text Classification", "Sentiment Analysis"]: target_col = st.selectbox("Select Target Variable", [col for col in df.columns if col != text_col], key="nlp_target_col") if nlp_task == "Sentiment Analysis" and df[target_col].nunique() > 10: st.error("Target has too many unique values for sentiment analysis.") return try: X, y, vectorizer, le = preprocess_nlp_data(df, text_col, target_col, nlp_task) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) except Exception as e: st.error(f"Preprocessing error: {str(e)}") return model_options = { "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000), "Random Forest Classifier": RandomForestClassifier(random_state=42), "SVM Classifier": SVC(random_state=42, probability=True), "Naive Bayes": MultinomialNB(), "Gradient Boosting Classifier": GradientBoostingClassifier(random_state=42) } selected_model_name = st.selectbox("Select Model", list(model_options.keys()), key="nlp_model_select") model = model_options[selected_model_name] if st.button("Train Model", key="nlp_train_button"): with st.spinner("Training NLP model..."): try: model.fit(X_train, y_train) y_pred = model.predict(X_test) metrics = evaluate_model(model, X_test, y_test, nlp_task, y_pred, vectorizer) st.markdown("### Model Performance") st.dataframe(pd.DataFrame(metrics.items(), columns=["Metric", "Value"]).round(4)) cv_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy') st.write(f"**Cross-Validation Scores** (5-fold): Mean = {cv_scores.mean():.4f}, Std = {cv_scores.std():.4f}") if selected_model_name in ["Random Forest Classifier", "Gradient Boosting Classifier"]: feature_names = vectorizer.get_feature_names_out() importance = model.feature_importances_ top_features = pd.DataFrame({"Feature": feature_names, "Importance": importance}).nlargest(10, "Importance") fig = px.bar(top_features, x="Feature", y="Importance", title="Top 10 Feature Importances") st.plotly_chart(fig) st.session_state['nlp_trained_model'] = model st.session_state['nlp_model_name'] = selected_model_name st.session_state['vectorizer'] = vectorizer st.session_state['label_encoder'] = le except Exception as e: st.error(f"Training error: {str(e)}") if st.button("Perform Hyperparameter Tuning", key="nlp_tune_button"): with st.spinner("Tuning hyperparameters..."): try: tuned_model, best_params = tune_model(model, X_train, y_train, nlp_task, selected_model_name) tuned_model.fit(X_train, y_train) y_pred = tuned_model.predict(X_test) metrics = evaluate_model(tuned_model, X_test, y_test, nlp_task, y_pred, vectorizer) st.markdown("### Tuned Model Performance") st.dataframe(pd.DataFrame(metrics.items(), columns=["Metric", "Value"]).round(4)) st.write("**Best Hyperparameters**:", best_params) st.session_state['nlp_trained_model'] = tuned_model st.session_state['nlp_model_name'] = selected_model_name + "_Tuned" except Exception as e: st.error(f"Tuning error: {str(e)}") if 'nlp_trained_model' in st.session_state and st.button("Save The Model", key="nlp_save_button"): model_file = save_model(st.session_state['nlp_trained_model'], st.session_state['nlp_model_name']) st.download_button( label="Download Model", data=model_file, file_name=f"{st.session_state['nlp_model_name'].replace(' ', '_').lower()}_model.pkl", mime="application/octet-stream", key="download_nlp_model" ) # Topic Modeling elif nlp_task == "Topic Modeling": try: corpus, _, dictionary, _ = preprocess_nlp_data(df, text_col, task_type="Topic Modeling") except Exception as e: st.error(f"Preprocessing error: {str(e)}") return num_topics = st.slider("Number of Topics", 2, 20, 5, key="num_topics") model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, random_state=42) if st.button("Perform Topic Modeling", key="topic_button"): with st.spinner("Performing topic modeling..."): try: metrics = evaluate_model(model, corpus, None, "Topic Modeling") st.markdown("### Topic Modeling Results") st.write("**Topics Identified**:") for topic_id, topic in metrics.items(): if isinstance(topic, str): st.write(f"- {topic}") st.session_state['nlp_trained_model'] = model st.session_state['nlp_model_name'] = "LDA" except Exception as e: st.error(f"Topic modeling error: {str(e)}") if st.button("Perform Hyperparameter Tuning", key="topic_tune_button"): with st.spinner("Tuning topics..."): try: tuned_model, best_params = tune_model(model, corpus, None, "Topic Modeling", "LDA") metrics = evaluate_model(tuned_model, corpus, None, "Topic Modeling") st.markdown("### Tuned Topic Modeling Results") st.write("**Tuned Topics Identified**:") for topic_id, topic in metrics.items(): if isinstance(topic, str): st.write(f"- {topic}") st.write("**Best Hyperparameters**:", best_params) st.session_state['nlp_trained_model'] = tuned_model st.session_state['nlp_model_name'] = "LDA_Tuned" except Exception as e: st.error(f"Tuning error: {str(e)}") if 'nlp_trained_model' in st.session_state and st.button("Save The Model", key="topic_save_button"): model_file = save_model(st.session_state['nlp_trained_model'], st.session_state['nlp_model_name']) st.download_button( label="Download Model", data=model_file, file_name=f"{st.session_state['nlp_model_name'].replace(' ', '_').lower()}_model.pkl", mime="application/octet-stream", key="download_topic_model" ) # Named Entity Recognition elif nlp_task == "Named Entity Recognition": try: X_text, _, _, _ = preprocess_nlp_data(df, text_col, task_type="Named Entity Recognition") except Exception as e: st.error(f"Preprocessing error: {str(e)}") return nlp = spacy.load("en_core_web_sm", disable=["parser", "textcat"]) if st.button("Perform NER", key="ner_button"): with st.spinner("Performing Named Entity Recognition..."): try: st.markdown("### NER Results") for text in X_text[:10]: doc = nlp(text) st.write(f"**Text**: {text[:100]}...") entities = [(ent.text, ent.label_) for ent in doc.ents] if entities: st.dataframe(pd.DataFrame(entities, columns=["Entity", "Label"])) else: st.write("No entities detected.") st.info("NER uses a pre-trained spaCy model, so no training or saving is required.") except Exception as e: st.error(f"NER error: {str(e)}") # Main app layout add_custom_styles() st.title("") add_header() tab1, tab2, tab3, tab4 = st.tabs(["Dataset Generator", "Example Prompts", "Chat with Dataset", "NLP Pipeline"]) with tab1: st.header("Generate Synthetic Datasets") st.write("Enter a prompt to generate a synthetic dataset. Be as descriptive as possible (e.g., 'Generate 500 rows for heart risk prediction with age, common symptoms like chest pain and shortness of breath, and a risk level (yes/no)'). For more examples, check the 'Example Prompts' tab.") prompt = st.text_area("Your prompt:", height=100) if "generated_code" not in st.session_state: st.session_state.generated_code = None st.session_state.expected_rows = None if st.button("Generate Code"): if prompt: code = generate_dataset_code(prompt) if code: st.session_state.generated_code = code st.session_state.expected_rows = extract_row_count(prompt) st.subheader("Generated Python Code") st.code(code, language="python") st.info("Review the code and click 'Get the Dataset'.") else: st.error("Generated code does not define create_dataset function correctly.") else: st.warning("Enter a prompt.") if st.session_state.generated_code and st.button("Get the Dataset"): df = execute_code(st.session_state.generated_code) if df is not None: if len(df) != st.session_state.expected_rows: st.warning(f"Dataset has {len(df)} rows; requested {st.session_state.expected_rows}.") st.subheader("Generated Dataset") st.write(f"Rows: {len(df)}, Columns: {', '.join(df.columns)}") st.dataframe(df.head()) csv_bytes = to_csv_bytes(df) st.download_button(label="Download CSV", data=csv_bytes, file_name="datagenie_dataset.csv", mime="text/csv") with tab2: st.header("Example Prompts") st.write("Explore example prompts to generate synthetic datasets for various domains.") st.subheader("💼 Finance & Business") st.write("Generate 1000 customer records for a bank with age, income, loan amount, credit score, and defaulted (Yes/No).") st.write("Create 500 rows of sales data with product category, region, sales amount, profit margin, and sales channel (Online/Offline).") st.write("Generate 200 rows of stock market data with date, opening price, closing price, highest price, lowest price, and trading volume.") st.subheader("🧑‍🎓 Education") st.write("Create 700 student records with study hours, attendance, and final grade (A, B, C, D, F).") st.write("Generate 300 rows of teacher performance data with years of experience, subject taught, average student score, and teacher rating (1-5).") st.write("Generate 1000 rows of university admission data with applicant age, GPA, SAT score, extracurricular activities, and admission status (Accepted/Rejected).") st.subheader("🌍 Environment") st.write("Generate 365 days of air quality data with PM2.5, PM10, CO2, and air quality (Good, Moderate, Hazardous).") st.write("Create 500 rows of weather data with date, temperature, humidity, wind speed, and precipitation level.") st.write("Generate 1000 rows of energy consumption data with household size, monthly usage (kWh), energy source (Solar, Wind, Grid), and cost.") st.subheader("🏥 Healthcare") st.write("Generate 1000 patient records with age, gender, blood pressure, cholesterol level, and diagnosis (Healthy, At Risk, Critical).") st.write("Create 500 rows of hospital data with department, number of patients, average treatment cost, and satisfaction rating (1-5).") st.write("Generate 300 rows of clinical trial data with participant ID, age, treatment type, side effects (Yes/No), and outcome (Improved/Unchanged/Worsened).") st.subheader("🚗 Transportation") st.write("Generate 1000 rows of vehicle data with make, model, year, fuel efficiency (mpg), and price.") st.write("Create 500 rows of traffic data with date, time, location, number of vehicles, and average speed.") st.write("Generate 300 rows of ride-sharing data with driver ID, trip distance, trip duration, fare amount, and rating (1-5).") st.subheader("🛒 Retail & E-commerce") st.write("Generate 1000 rows of customer purchase data with customer ID, product category, purchase amount, and payment method (Credit Card, PayPal, Cash).") st.write("Create 500 rows of inventory data with product ID, category, stock level, reorder point, and supplier.") st.write("Generate 300 rows of website analytics data with date, page views, unique visitors, bounce rate, and conversion rate.") st.subheader("🏗️ Construction & Real Estate") st.write("Generate 500 rows of real estate data with property type, location, size (sq ft), price, and status (Available/Sold).") st.write("Create 300 rows of construction project data with project ID, start date, end date, budget, and completion status (On Track/Delayed).") st.write("Generate 200 rows of rental data with property type, monthly rent, tenant age, and lease duration (months).") st.subheader("🎮 Gaming & Entertainment") st.write("Generate 1000 rows of gaming data with player ID, game title, hours played, in-game purchases, and player rank.") st.write("Create 500 rows of movie data with title, genre, release year, box office revenue, and IMDb rating.") st.write("Generate 300 rows of music streaming data with user ID, song title, artist, play count, and duration (minutes).") with tab3: st.header("Chat with Dataset") uploaded_file = st.file_uploader("Upload CSV for Chatting", type="csv") if uploaded_file: try: df = pd.read_csv(uploaded_file) st.success("File uploaded successfully!") chat_with_dataset(df) except Exception as e: st.error(f"Error loading CSV file: {str(e)}") else: st.info("Upload a CSV file to start chatting.") # NLP Pipeline Tab with tab4: nlp_pipeline_tab() # Line ~1100: Footer add_footer() # Sidebar for data processing and visualization add_sidebar() feature_options = st.sidebar.radio("Select Option", ["Dataset Overview", "Clean Data", "Detect Outlier", "Encoder", "Data Transformer", "Data Analysis", "Feature Importance Analyzer", "Best Parameter Selector", "Train The Dataset", "Clear Modified Dataset", "Visualizations"]) if 'uploaded_df' in st.session_state: df = st.session_state['uploaded_df'] try: if feature_options == "Dataset Overview": dataset_overview(df) elif feature_options == "Clean Data": st.session_state['uploaded_df'] = clean_data(df) elif feature_options == "Detect Outlier": detect_outlier(df) elif feature_options == "Encoder": st.session_state['uploaded_df'] = encoder(df) elif feature_options == "Data Transformer": st.session_state['uploaded_df'] = data_transformer(df) elif feature_options == "Data Analysis": data_analysis(df) elif feature_options == "Feature Importance Analyzer": feature_importance_analyzer(df) elif feature_options == "Best Parameter Selector": best_parameter_selector(df) elif feature_options == "Train The Dataset": select_ml_models(df) elif feature_options == "Clear Modified Dataset": clear_modified_dataset() elif feature_options == "Visualizations": visualize_dataset(df) features = st.sidebar.multiselect("Select features for specific visualizations", df.columns.tolist()) if features: visualize_specific_features(df, features) elif feature_options == "NLP Pipeline": nlp_pipeline_tab() if 'uploaded_df' in st.session_state: df = st.session_state['uploaded_df'] except Exception as e: st.error(f"Error processing dataset: {str(e)}") else: st.sidebar.info("Upload a CSV to proceed.")