import streamlit as st
import pandas as pd
from faker import Faker
import random
from groq import Groq
from io import BytesIO
import ast
import re
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import io
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
import matplotlib
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC, SVR
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
import numpy as np
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             mean_absolute_error, mean_squared_error, r2_score,
                             silhouette_score, davies_bouldin_score, calinski_harabasz_score)

# For NLP Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim import corpora
from gensim.models import LdaModel
import spacy
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Download NLTK resources
import os

nltk.data.path.append(os.path.join(os.path.dirname(__file__), 'nltk_data'))
try:
    nltk.data.find('tokenizers/punkt_tab')
    nltk.data.find('corpora/stopwords')
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('punkt_tab')
    nltk.download('stopwords')
    nltk.download('wordnet')

# Ensure spaCy model
try:
    spacy.load("en_core_web_sm")
except OSError:
    os.system("python -m spacy download en_core_web_sm")

# Conditional import for time series models
try:
    from statsmodels.tsa.holtwinters import ExponentialSmoothing
    from statsmodels.tsa.arima.model import ARIMA
    HAS_STATSMODELS = True
except ImportError:
    HAS_STATSMODELS = False

# Set matplotlib backend for Streamlit compatibility
matplotlib.use('Agg')

# Initialize Faker and apply custom styles
fake = Faker()

def add_custom_styles():
    st.markdown(
        """
        <style>
        @import url('https://fonts.googleapis.com/css2?family=Roboto:wght@300;400;500;700&display=swap');
        html, body, [class*="css"] {
            font-family: 'Roboto', sans-serif;
            background-color: #f4f4f9;
        }
        .stButton>button {
            background-color: #4CAF50;
            color: white;
            border: none;
            padding: 10px 20px;
            border-radius: 5px;
            font-size: 16px;
        }
        .stButton>button:hover {
            background-color: #45a049;
        }
        .header-banner {
            text-align: center;
            margin-bottom: 20px;
        }
        .header-banner img {
            max-width: 150px;
            margin-bottom: 10px;
        }
        .header-banner h1 {
            font-size: 36px;
            color: #333;
            margin: 0;
        }
        .header-banner p {
            font-size: 16px;
            color: #666;
        }
        footer {
            text-align: center;
            margin-top: 50px;
            padding: 10px;
            font-size: 14px;
            color: #888;
        }
        footer a {
            color: #4CAF50;
            text-decoration: none;
        }
        footer a:hover {
            text-decoration: underline;
        }
        </style>
        """,
        unsafe_allow_html=True
    )

def add_header():
    st.markdown(
        """
        <div class="header-banner">
            <img src="https://i.postimg.cc/5y20B10S/89c59ca6-c8a8-4210-ba7b-f77a44a8fa3a-removalai-preview.png" alt="DataGenie Logo" style="max-width: 280px;">
            <p>Empowering your data journey with AI-driven insights and synthetic datasets</p>
        </div>
        """,
        unsafe_allow_html=True
    )
    st.markdown("### Upload Your Dataset for Preprocessing, Training, and EDA")
    uploaded_file = st.file_uploader("Upload CSV", type="csv")
    if uploaded_file:
        try:
            df = pd.read_csv(uploaded_file)
            st.success("Dataset uploaded successfully!")
            st.session_state['uploaded_df'] = df
            st.write("Preview of the uploaded dataset:")
            st.dataframe(df.head())
        except Exception as e:
            st.error(f"Error loading CSV file: {str(e)}")
    else:
        st.info("Upload a CSV file to get started, then use the sidebar to work on your dataset.")

def add_footer():
    st.markdown(
        """
        <footer>
            Developed by <a href="https://github.com/Mahatir-Ahmed-Tusher" target="_blank">Mahatir Ahmed Tusher</a>.
            Inspired by the project "Predicta" by <a href="https://github.com/ahammadnafiz" target="_blank">Ahammad Nafiz</a>.
        </footer>
        """,
        unsafe_allow_html=True
    )

def add_sidebar():
    st.sidebar.image(
        "https://i.postimg.cc/5y20B10S/89c59ca6-c8a8-4210-ba7b-f77a44a8fa3a-removalai-preview.png",
        width=150,
        caption="DataGenie"
    )
    st.sidebar.markdown("---")
    st.sidebar.title("About DataGenie")
    st.sidebar.info(
        "DataGenie: AI-powered data science assistant. Generate datasets, analyze data, build ML models. "
        "Features: dataset generation, visualization, outlier detection, feature processing, ML model "
        "selection, and chat-based exploration."
    )
    st.sidebar.write("**Developed by:** Mahatir Ahmed Tusher")
    st.sidebar.write("**Inspired by:** Predicta by Ahammad Nafiz")
    st.sidebar.markdown("---")

# App configuration
APP_NAME = "DataGenie"

# Initialize Groq client with API key (read from the environment; never hard-code secrets)
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
if not GROQ_API_KEY:
    st.error("GROQ_API_KEY environment variable is not set. Please configure it before running.")
    st.stop()
try:
    client = Groq(api_key=GROQ_API_KEY)
except Exception as e:
    st.error(f"Invalid Groq API key: {str(e)}. Please update GROQ_API_KEY.")
    st.stop()

# Utility functions
def extract_row_count(prompt):
    match = re.search(r'(\d+)\s*(rows|records|entries)', prompt, re.IGNORECASE)
    return int(match.group(1)) if match else 100
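
# Example (illustrative): extract_row_count("Generate 250 rows of sales data") returns 250,
# since the regex matches a number followed by "rows", "records", or "entries" (case-insensitive).
# A prompt with no such phrase, e.g. "customer churn data", falls back to the default of 100.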

def generate_dataset_code(prompt):
    try:
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": (
                        "You are an expert Python code generator specializing in creating synthetic datasets using pandas, faker, and random. "
                        "Based on the user's natural language prompt, generate a valid Python function named `create_dataset()` that returns a pandas DataFrame. "
                        "Follow these strict rules:\n"
                        "1. The function must start exactly with `def create_dataset():` and take no arguments.\n"
                        "2. Use only `pd` (pandas), `fake` (Faker), and `random` (random module) within the function.\n"
                        "3. Extract the number of rows from the prompt (e.g., '500 rows' or '1000 records') and use `range(<row_count>)` to generate exactly that many rows. If no row count is specified, default to 100 rows.\n"
                        "4. Generate realistic data for all columns specified in the prompt, respecting any domain-specific details (e.g., age between 18-80, prices in USD, regional names).\n"
                        "5. For target columns (e.g., 'yes/no', 'percentage', 'price', 'category'), use appropriate distributions or logic (e.g., random.choice(['Yes', 'No']), random.uniform(0, 100) for percentages).\n"
                        "6. Ensure data types are correct: integers for counts, floats for percentages/prices, strings for names/emails, etc.\n"
                        "7. The function must end with `return pd.DataFrame(data)` where `data` is a dictionary of column lists.\n"
                        "8. Do not include comments, markdown, explanations, or extra text outside the function definition.\n"
                        "Example for prompt 'Generate 200 rows of customer data with name, age, email, and purchase_amount':\n"
                        "def create_dataset():\n"
                        "    data = {\n"
                        "        'name': [fake.name() for _ in range(200)],\n"
                        "        'age': [random.randint(18, 80) for _ in range(200)],\n"
                        "        'email': [fake.email() for _ in range(200)],\n"
                        "        'purchase_amount': [round(random.uniform(10.0, 500.0), 2) for _ in range(200)]\n"
                        "    }\n"
                        "    return pd.DataFrame(data)\n"
                        "Handle edge cases gracefully, such as missing column details, by using reasonable defaults. "
                        "Ensure the code is syntactically correct and executable. Remember, in case of classification yes means 1 and no means 0."
                    ),
                },
                {"role": "user", "content": prompt},
            ],
            model="llama-3.3-70b-versatile",
        )
        code = chat_completion.choices[0].message.content.strip()
        if not code.startswith("def create_dataset():"):
            st.error("Generated code does not define create_dataset function correctly.")
            st.code(code, language="python")
            return None
        try:
            ast.parse(code)
            return code
        except SyntaxError as e:
            st.error(f"Invalid syntax in generated code: {str(e)}")
            st.code(code, language="python")
            return None
    except Exception as e:
        st.error(f"Error with Groq API: {str(e)}")
        return None

def execute_code(code):
    safe_globals = {
        "pd": pd,
        "fake": fake,
        "random": random,
        "__builtins__": {
            "range": range, "list": list, "int": int, "str": str, "float": float,
            "round": round, "True": True, "False": False, "zip": zip,
        },
    }
    safe_locals = {}
    try:
        exec(code, safe_globals, safe_locals)
        create_dataset = safe_locals.get("create_dataset")
        if not create_dataset:
            st.error("No create_dataset function defined.")
            return None
        df = create_dataset()
        if not isinstance(df, pd.DataFrame):
            st.error("Generated code did not return a pandas DataFrame.")
            return None
        return df
    except Exception as e:
        st.error(f"Execution error: {str(e)}")
        return None
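
# Note: the restricted `__builtins__` mapping above acts as a lightweight guard rail.
# The generated code only sees pd, fake, and random plus a few whitelisted builtins,
# so names like open or __import__ are simply undefined at execution time. This is
# a sanity check against accidental misuse, not a hardened security sandbox.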

def to_csv_bytes(df):
    output = BytesIO()
    df.to_csv(output, index=False)
    output.seek(0)
    return output
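
# Typical wiring (illustrative): the in-memory buffer can be handed straight to a
# Streamlit download button, e.g.
#     st.download_button("Download CSV", to_csv_bytes(df), file_name="dataset.csv", mime="text/csv")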

# Visualization functions
def visualize_dataset(df):
    st.subheader("Dataset Visualizations")
    # Check the type first: calling .empty on a non-DataFrame would raise
    if not isinstance(df, pd.DataFrame) or df.empty:
        st.warning("No valid data to visualize.")
        return
    numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist()
    all_cols = numerical_cols + categorical_cols + datetime_cols
    if not all_cols:
        st.warning("No columns available to visualize.")
        return
    viz_type = st.sidebar.selectbox("Select Visualization Type",
                                    ["Histogram", "Box Plot", "Scatter Plot", "Count Plot",
                                     "Correlation Heatmap"] + (["Time Series"] if datetime_cols and numerical_cols else []))
    plt.clf()
    try:
        if viz_type == "Histogram" and numerical_cols:
            col = st.sidebar.selectbox("Select Numerical Column", numerical_cols)
            fig, ax = plt.subplots()
            sns.histplot(data=df, x=col, kde=True, bins='auto', ax=ax)
            st.pyplot(fig)
            download_image(fig, f"histogram_{col}")
            plt.close(fig)
        elif viz_type == "Box Plot" and numerical_cols:
            col = st.sidebar.selectbox("Select Numerical Column", numerical_cols)
            fig, ax = plt.subplots()
            sns.boxplot(data=df, y=col, ax=ax)
            st.pyplot(fig)
            download_image(fig, f"boxplot_{col}")
            plt.close(fig)
        elif viz_type == "Scatter Plot" and len(numerical_cols) >= 2:
            x_col = st.sidebar.selectbox("Select X-axis Column", numerical_cols)
            y_col = st.sidebar.selectbox("Select Y-axis Column", [c for c in numerical_cols if c != x_col])
            fig = px.scatter(df, x=x_col, y=y_col)
            st.plotly_chart(fig)
            img_bytes = io.BytesIO()
            fig.write_image(img_bytes, format='png')
            st.sidebar.download_button("Download Scatter Plot", img_bytes.getvalue(),
                                       file_name=f"scatter_{x_col}_{y_col}.png",
                                       key=f"scatter_{x_col}_{y_col}_{datetime.now().strftime('%H%M%S')}")
        elif viz_type == "Count Plot" and categorical_cols:
            col = st.sidebar.selectbox("Select Categorical Column", categorical_cols)
            fig, ax = plt.subplots()
            sns.countplot(data=df, x=col, ax=ax)
            plt.xticks(rotation=45, ha='right')
            st.pyplot(fig)
            download_image(fig, f"countplot_{col}")
            plt.close(fig)
        elif viz_type == "Correlation Heatmap" and numerical_cols:
            fig, ax = plt.subplots(figsize=(10, 8))
            sns.heatmap(df[numerical_cols].corr(), annot=True, cmap="coolwarm", vmin=-1, vmax=1, fmt='.2f', ax=ax)
            st.pyplot(fig)
            download_image(fig, "correlation_heatmap")
            plt.close(fig)
        elif viz_type == "Time Series" and datetime_cols and numerical_cols:
            datetime_col = st.sidebar.selectbox("Select Datetime Column", datetime_cols)
            value_col = st.sidebar.selectbox("Select Value Column", numerical_cols)
            df[datetime_col] = pd.to_datetime(df[datetime_col], errors='coerce')
            fig = px.line(df, x=datetime_col, y=value_col)
            st.plotly_chart(fig)
            img_bytes = io.BytesIO()
            fig.write_image(img_bytes, format='png')
            st.sidebar.download_button("Download Time Series", img_bytes.getvalue(),
                                       file_name=f"time_series_{datetime_col}_{value_col}.png",
                                       key=f"timeseries_{datetime_col}_{value_col}_{datetime.now().strftime('%H%M%S')}")
    except Exception as e:
        st.error(f"Visualization error: {str(e)}")

def visualize_specific_features(df, features):
    st.subheader("Feature-Specific Visualizations")
    for feature in features:
        if feature not in df.columns:
            st.warning(f"Feature '{feature}' not found.")
            continue
        fig, ax = plt.subplots()
        try:
            if pd.api.types.is_numeric_dtype(df[feature]):
                sns.histplot(data=df, x=feature, kde=True, bins='auto', ax=ax)
            elif pd.api.types.is_categorical_dtype(df[feature]) or pd.api.types.is_string_dtype(df[feature]):
                sns.countplot(data=df, x=feature, ax=ax)
                plt.xticks(rotation=45, ha='right')
            elif pd.api.types.is_datetime64_any_dtype(df[feature]):
                st.warning(f"Use 'Time Series' in main visualization for '{feature}'.")
                plt.close(fig)
                continue
            st.pyplot(fig)
            download_image(fig, f"feature_{feature}")
            plt.close(fig)
        except Exception as e:
            st.error(f"Error visualizing '{feature}': {str(e)}")
            plt.close(fig)

def download_image(fig, key_prefix):
    """
    Utility function to download a Matplotlib figure as PNG.
    """
    img_bytes = BytesIO()
    fig.savefig(img_bytes, format='png', bbox_inches='tight')
    img_bytes.seek(0)
    st.download_button(label="Download Image", data=img_bytes,
                       file_name=f"{key_prefix}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png",
                       mime="image/png",
                       key=f"download_{key_prefix}_{datetime.now().strftime('%H%M%S')}")

# Data processing functions
def dataset_overview(df):
    st.subheader("Dataset Overview")
    st.markdown("#### Basic Information")
    st.write(f"**Rows**: {len(df):,} | **Columns**: {len(df.columns):,}")
    st.write(f"**Memory Usage**: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
    st.markdown("#### Data Types and Missing Values")
    dtypes_df = pd.DataFrame({
        "Column": df.columns,
        "Data Type": df.dtypes,
        "Non-Null Count": df.count(),
        "Missing Values": df.isna().sum(),
        "Missing %": (df.isna().sum() / len(df) * 100).round(2)
    }).reset_index(drop=True)
    st.dataframe(dtypes_df.style.highlight_null(color='lightcoral'))
    st.markdown("#### Numerical Columns Summary")
    numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
    if numerical_cols.size > 0:
        numerical_summary = df[numerical_cols].describe().T.round(2)
        numerical_summary['Skewness'] = df[numerical_cols].skew().round(2)
        numerical_summary['Kurtosis'] = df[numerical_cols].kurt().round(2)
        st.dataframe(numerical_summary)
    st.markdown("#### Categorical Columns Summary")
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    if categorical_cols.size > 0:
        for col in categorical_cols:
            value_counts = df[col].value_counts().head(5)
            st.write(f"**{col}** (Top 5 values):")
            st.dataframe(pd.DataFrame({
                "Value": value_counts.index,
                "Count": value_counts.values,
                "% of Total": (value_counts.values / len(df) * 100).round(2)
            }))
    st.markdown("#### Duplicate Rows")
    duplicate_count = df.duplicated().sum()
    if duplicate_count > 0:
        st.warning(f"Found {duplicate_count} duplicate rows ({duplicate_count / len(df) * 100:.2f}% of total).")
    else:
        st.success("No duplicate rows detected.")
    st.markdown("#### Sample Data (First 5 Rows)")
    st.dataframe(df.head())

def clean_data(df):
    st.subheader("Clean Data")
    st.markdown("#### Missing Values")
    missing_values = df.isnull().sum()
    missing_percentage = (missing_values / len(df) * 100).round(2)
    missing_summary = pd.DataFrame({
        "Missing Values": missing_values,
        "Missing Percentage (%)": missing_percentage
    }).sort_values(by="Missing Values", ascending=False)
    st.dataframe(missing_summary)
    st.markdown("#### Duplicate Rows")
    duplicate_count = df.duplicated().sum()
    if duplicate_count > 0:
        st.warning(f"Found {duplicate_count} duplicate rows. They will be removed.")
    else:
        st.success("No duplicate rows detected.")
    cleaned_df = df.dropna().drop_duplicates()
    st.write(f"Cleaned Dataset: {len(cleaned_df)} rows remaining after cleaning.")
    st.dataframe(cleaned_df.head())
    return cleaned_df
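
# Trade-off note: dropna() discards every row that contains any missing value. When
# missingness is sparse, imputation (for example df.fillna(df.median(numeric_only=True))
# for numeric columns) preserves far more rows; the aggressive drop here keeps the
# cleaning step simple and predictable.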

def detect_outlier(df):
    st.subheader("Detect Outliers")
    numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
    if numerical_cols.empty:
        st.warning("No numerical columns available for outlier detection.")
        return
    st.markdown("#### Outlier Detection Summary")
    outlier_summary = []
    for col in numerical_cols:
        Q1, Q3 = df[col].quantile([0.25, 0.75])
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
        outlier_count = len(outliers)
        outlier_percentage = round((outlier_count / len(df) * 100), 2)
        outlier_summary.append({
            "Column": col,
            "Outliers": outlier_count,
            "Outlier Percentage (%)": outlier_percentage
        })
    outlier_df = pd.DataFrame(outlier_summary).sort_values(by="Outliers", ascending=False)
    st.dataframe(outlier_df)
    st.markdown("#### Outlier Visualization")
    selected_col = st.selectbox("Select a column to visualize outliers", numerical_cols)
    if selected_col:
        Q1, Q3 = df[selected_col].quantile([0.25, 0.75])
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        fig, ax = plt.subplots()
        sns.boxplot(data=df, x=selected_col, ax=ax)
        # The boxplot is horizontal (values on the x-axis), so the bounds are vertical lines
        ax.axvline(lower_bound, color='red', linestyle='--', label='Lower Bound')
        ax.axvline(upper_bound, color='blue', linestyle='--', label='Upper Bound')
        ax.legend()
        st.pyplot(fig)
        plt.close(fig)
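
# Worked example of the 1.5*IQR rule used above: with Q1 = 10 and Q3 = 20, IQR = 10,
# so values below 10 - 1.5*10 = -5 or above 20 + 1.5*10 = 35 are flagged as outliers.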

# Data Encoder
def encoder(df):
    """
    Encodes categorical columns in the dataset using user-selected methods (Label Encoding,
    One-Hot Encoding, or Frequency Encoding). Provides control over column selection, handles
    missing values, and displays encoding details.

    Args:
        df (pd.DataFrame): Input dataset to encode.

    Returns:
        pd.DataFrame: Encoded dataset.
    """
    st.subheader("Encode Data")
    # Initialize session state for encoded DataFrame
    if 'encoded_df' not in st.session_state:
        st.session_state.encoded_df = df.copy()
    # Identify categorical columns
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    if not categorical_cols:
        st.warning("No categorical columns ('object' or 'category') found in the dataset.")
        return df
    # Display original categorical columns
    st.markdown("### Categorical Columns Detected")
    st.write(f"Found {len(categorical_cols)} categorical columns: {', '.join(categorical_cols)}")
    for col in categorical_cols:
        st.write(f"- **{col}**: {df[col].nunique()} unique values, "
                 f"{df[col].isna().sum()} missing ({df[col].isna().sum() / len(df) * 100:.2f}%)")
    # User configuration
    st.markdown("### Encoding Configuration")
    encoding_methods = {
        "Label Encoding": "Assigns integers to categories (best for ordinal data).",
        "One-Hot Encoding": "Creates binary columns for each category (best for non-ordinal data, avoid high cardinality).",
        "Frequency Encoding": "Replaces categories with their frequency (useful for high-cardinality columns)."
    }
    # Select columns to encode
    cols_to_encode = st.multiselect("Select Columns to Encode", categorical_cols, default=categorical_cols,
                                    help="Choose which categorical columns to encode. Unselected columns remain unchanged.")
    if not cols_to_encode:
        st.warning("Please select at least one column to encode.")
        return st.session_state.encoded_df
    # Missing value handling
    missing_strategy = st.selectbox("Handle Missing Values",
                                    ["Keep as NaN", "Impute with Mode", "Impute with Custom Value"],
                                    help="Choose how to handle missing values before encoding.")
    custom_value = None
    if missing_strategy == "Impute with Custom Value":
        custom_value = st.text_input("Enter Custom Value for Missing Entries", value="Unknown")
    # Apply missing value handling
    encoded_df = st.session_state.encoded_df.copy()
    for col in cols_to_encode:
        if missing_strategy == "Impute with Mode":
            mode_val = df[col].mode()[0] if not df[col].mode().empty else "Unknown"
            encoded_df[col] = df[col].fillna(mode_val)
        elif missing_strategy == "Impute with Custom Value":
            encoded_df[col] = df[col].fillna(custom_value)
    # Encoding method selection per column
    st.markdown("### Assign Encoding Methods")
    encoding_assignments = {}
    for col in cols_to_encode:
        default_method = "One-Hot Encoding" if df[col].nunique() <= 10 else "Frequency Encoding"
        encoding_assignments[col] = st.selectbox(
            f"Encoding Method for {col}",
            list(encoding_methods.keys()),
            index=list(encoding_methods.keys()).index(default_method),
            help=f"{encoding_methods[default_method]} Unique values: {df[col].nunique()}"
        )
    # Apply encoding
    if st.button("Apply Encoding"):
        try:
            for col, method in encoding_assignments.items():
                if method == "Label Encoding":
                    le = LabelEncoder()
                    # Convert to string to handle mixed types and NaNs
                    encoded_df[col] = le.fit_transform(encoded_df[col].astype(str))
                    st.session_state[f"label_encoder_{col}"] = le  # Store encoder for reference
                    st.write(f"**{col}**: Label Encoded. Classes: {list(le.classes_)}")
                elif method == "One-Hot Encoding":
                    if df[col].nunique() > 50:
                        st.warning(f"**{col}** has {df[col].nunique()} unique values. One-Hot Encoding may create many columns.")
                    # Drop NaN for one-hot encoding, reintroduce after
                    mask = encoded_df[col].notna()
                    ohe_df = pd.get_dummies(encoded_df.loc[mask, col], prefix=col, drop_first=True)
                    encoded_df = pd.concat([encoded_df.drop(columns=[col]), ohe_df], axis=1)
                    encoded_df.loc[~mask, ohe_df.columns] = np.nan
                    st.write(f"**{col}**: One-Hot Encoded. Created {len(ohe_df.columns)} new columns.")
                elif method == "Frequency Encoding":
                    freq_map = df[col].value_counts(normalize=True).to_dict()
                    encoded_df[col] = df[col].map(freq_map)
                    st.write(f"**{col}**: Frequency Encoded. Values mapped to proportions.")
            # Update session state
            st.session_state.encoded_df = encoded_df
            # Display results
            st.markdown("### Encoded Dataset Preview")
            st.dataframe(encoded_df.head())
            # Data quality check
            new_cols = len(encoded_df.columns) - len(df.columns)
            if new_cols > 0:
                st.info(f"Encoding added {new_cols} new columns.")
            if encoded_df.isna().sum().sum() > 0:
                st.warning(f"Encoded dataset still has {encoded_df.isna().sum().sum()} missing values.")
            # Download option
            csv_bytes = encoded_df.to_csv(index=False).encode()
            st.download_button(
                label="Download Encoded Dataset",
                data=csv_bytes,
                file_name="encoded_dataset.csv",
                mime="text/csv",
                key="download_encoded"
            )
        except Exception as e:
            st.error(f"Error during encoding: {str(e)}")
            return df
    # Preview current encoded state
    else:
        st.markdown("### Current Dataset Preview")
        st.dataframe(st.session_state.encoded_df.head())
    return st.session_state.encoded_df
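
# Frequency-encoding example (illustrative): for a column with values
# ['A', 'A', 'B', 'C'], value_counts(normalize=True) yields {'A': 0.5, 'B': 0.25, 'C': 0.25},
# so each category is replaced by its share of the rows.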

# Data Transformer
def data_transformer(df):
    st.subheader("Data Transformer")
    transformed_df = df.copy()  # Placeholder for future transformations
    st.write("Transformed Dataset:", transformed_df.head())
    return transformed_df

# Data Analysis
def data_analysis(df):
    """
    Performs an in-depth analysis of the dataset, including numerical and categorical summaries,
    interactive visualizations, data quality checks, and column-specific exploration.

    Args:
        df (pd.DataFrame): Input dataset to analyze.
    """
    st.subheader("Data Analysis")
    # Initialize tabs for different analysis types
    tab1, tab2, tab3, tab4 = st.tabs(["Summary Statistics", "Visual Exploration", "Data Quality", "Column Deep Dive"])
    with tab1:
        st.markdown("### Summary Statistics")
        # Numerical Columns
        numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
        if numerical_cols.size > 0:
            st.markdown("#### Numerical Columns")
            numerical_summary = df[numerical_cols].describe().T.round(2)
            numerical_summary['Skewness'] = df[numerical_cols].skew().round(2)
            numerical_summary['Kurtosis'] = df[numerical_cols].kurt().round(2)
            numerical_summary['Missing %'] = (df[numerical_cols].isna().sum() / len(df) * 100).round(2)
            st.dataframe(numerical_summary.style.highlight_max(axis=0, color='lightgreen'))
            st.write("*Skewness > 1 or < -1 indicates high skew. Kurtosis > 3 indicates heavy tails.*")
        else:
            st.info("No numerical columns found.")
        # Categorical Columns
        categorical_cols = df.select_dtypes(include=['object', 'category']).columns
        if categorical_cols.size > 0:
            st.markdown("#### Categorical Columns")
            cat_summary = pd.DataFrame({
                "Column": categorical_cols,
                "Unique Values": [df[col].nunique() for col in categorical_cols],
                "Most Frequent": [df[col].mode()[0] if not df[col].mode().empty else np.nan for col in categorical_cols],
                "Missing %": [(df[col].isna().sum() / len(df) * 100).round(2) for col in categorical_cols]
            })
            st.dataframe(cat_summary)
        else:
            st.info("No categorical columns found.")
    with tab2:
        st.markdown("### Visual Exploration")
        viz_type = st.selectbox("Select Visualization Type",
                                ["Distribution (Numerical)", "Count Plot (Categorical)", "Correlation Heatmap", "Pair Plot"],
                                key="data_analysis_viz")
        if viz_type == "Distribution (Numerical)" and numerical_cols.size > 0:
            col = st.selectbox("Select Column", numerical_cols, key="num_dist_col")
            fig, ax = plt.subplots()
            sns.histplot(data=df, x=col, kde=True, bins='auto', ax=ax)
            ax.set_title(f"Distribution of {col}")
            st.pyplot(fig)
            download_image(fig, f"dist_{col}")
            plt.close(fig)
        elif viz_type == "Count Plot (Categorical)" and categorical_cols.size > 0:
            col = st.selectbox("Select Column", categorical_cols, key="cat_count_col")
            fig, ax = plt.subplots()
            sns.countplot(data=df, x=col, ax=ax)
            plt.xticks(rotation=45, ha='right')
            ax.set_title(f"Count Plot of {col}")
            st.pyplot(fig)
            download_image(fig, f"count_{col}")
            plt.close(fig)
        elif viz_type == "Correlation Heatmap" and numerical_cols.size > 1:
            fig, ax = plt.subplots(figsize=(10, 8))
            sns.heatmap(df[numerical_cols].corr(), annot=True, cmap="coolwarm", vmin=-1, vmax=1, fmt='.2f', ax=ax)
            ax.set_title("Correlation Heatmap")
            st.pyplot(fig)
            download_image(fig, "corr_heatmap")
            plt.close(fig)
        elif viz_type == "Pair Plot" and numerical_cols.size > 0:
            selected_cols = st.multiselect("Select Columns (max 4)", numerical_cols, max_selections=4, key="pair_cols")
            if len(selected_cols) >= 2:
                fig = sns.pairplot(df[selected_cols].dropna())
                st.pyplot(fig)
                img_bytes = BytesIO()
                fig.savefig(img_bytes, format='png', bbox_inches='tight')
                img_bytes.seek(0)
                st.download_button(label="Download Pair Plot", data=img_bytes,
                                   file_name=f"pairplot_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png",
                                   mime="image/png", key=f"pairplot_{datetime.now().strftime('%H%M%S')}")
                plt.close()
    with tab3:
        st.markdown("### Data Quality Checks")
        # Missing Values
        missing_total = df.isna().sum().sum()
        if missing_total > 0:
            st.warning(f"**Missing Values**: {missing_total} across {df.isna().any().sum()} columns.")
            missing_df = pd.DataFrame({
                "Column": df.columns,
                "Missing Count": df.isna().sum(),
                "Missing %": (df.isna().sum() / len(df) * 100).round(2)
            })
            missing_df = missing_df[missing_df["Missing Count"] > 0]
            st.dataframe(missing_df)
        else:
            st.success("No missing values detected.")
        # Duplicates
        duplicates = df.duplicated().sum()
        if duplicates > 0:
            st.warning(f"**Duplicates**: {duplicates} duplicate rows ({duplicates / len(df) * 100:.2f}%).")
        else:
            st.success("No duplicate rows detected.")
        # Outliers (Numerical)
        if numerical_cols.size > 0:
            outlier_summary = []
            for col in numerical_cols:
                Q1, Q3 = df[col].quantile([0.25, 0.75])
                IQR = Q3 - Q1
                outliers = ((df[col] < (Q1 - 1.5 * IQR)) | (df[col] > (Q3 + 1.5 * IQR))).sum()
                if outliers > 0:
                    outlier_summary.append({"Column": col, "Outlier Count": outliers,
                                            "Outlier %": (outliers / len(df) * 100).round(2)})
            if outlier_summary:
                st.warning("**Outliers Detected**:")
                st.dataframe(pd.DataFrame(outlier_summary))
            else:
                st.success("No outliers detected in numerical columns.")
    with tab4:
        st.markdown("### Column Deep Dive")
        selected_col = st.selectbox("Select Column for Detailed Analysis", df.columns, key="deep_dive_col")
        st.write(f"**Column**: {selected_col}")
        st.write(f"**Data Type**: {df[selected_col].dtype}")
        st.write(f"**Missing Values**: {df[selected_col].isna().sum()} ({df[selected_col].isna().sum() / len(df) * 100:.2f}%)")
        st.write(f"**Unique Values**: {df[selected_col].nunique()} ({df[selected_col].nunique() / len(df) * 100:.2f}%)")
        if pd.api.types.is_numeric_dtype(df[selected_col]):
            st.write("**Summary Statistics**:")
            stats = df[selected_col].describe().round(2)
            stats['Skewness'] = df[selected_col].skew().round(2)
            stats['Kurtosis'] = df[selected_col].kurt().round(2)
            st.dataframe(stats)
            fig = px.histogram(df, x=selected_col, nbins=30, title=f"Distribution of {selected_col}")
            st.plotly_chart(fig)
        elif pd.api.types.is_object_dtype(df[selected_col]) or pd.api.types.is_categorical_dtype(df[selected_col]):
            st.write("**Top 5 Values**:")
            value_counts = df[selected_col].value_counts().head(5)
            st.dataframe(pd.DataFrame({
                "Value": value_counts.index,
                "Count": value_counts.values,
                "% of Total": (value_counts.values / len(df) * 100).round(2)
            }))
            fig = px.bar(x=value_counts.index, y=value_counts.values, title=f"Top Values in {selected_col}")
            st.plotly_chart(fig)

# Feature Importance Analysis
def feature_importance_analyzer(df):
    st.subheader("Feature Importance Analyzer")
    target_column = st.selectbox("Select Target Column", df.columns)
    feature_columns = [col for col in df.columns if col != target_column]
    if not feature_columns:
        st.warning("No features available.")
        return
    X = pd.get_dummies(df[feature_columns], drop_first=True)
    y = df[target_column]
    if y.dtype in ['object', 'category']:
        y = LabelEncoder().fit_transform(y)
    try:
        # Wrap in a Series: after LabelEncoder, y is a NumPy array with no .nunique()
        model = RandomForestClassifier(random_state=42) if pd.Series(y).nunique() <= 10 else RandomForestRegressor(random_state=42)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        model.fit(X_train, y_train)
        importance_df = pd.DataFrame({"Feature": X.columns, "Importance": model.feature_importances_}).sort_values(by="Importance", ascending=False)
        st.write("Feature Importances:", importance_df)
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.barplot(data=importance_df, x="Importance", y="Feature", palette="viridis", ax=ax)
        st.pyplot(fig)
        download_image(fig, "feature_importance")
        plt.close(fig)
    except Exception as e:
        st.error(f"Error analyzing features: {str(e)}")

# Best Parameter Selector
def best_parameter_selector(df):
    st.subheader("Best Parameter Selector")
    task_type = st.selectbox("Select Task Type", ["Classification", "Regression"])
    target_column = st.selectbox("Select Target Column", df.columns)
    feature_columns = [col for col in df.columns if col != target_column]
    if not feature_columns:
        st.warning("No features available.")
        return
    X = pd.get_dummies(df[feature_columns], drop_first=True)
    y = df[target_column]
    if task_type == "Classification" and y.dtype in ['object', 'category']:
        y = LabelEncoder().fit_transform(y)
    model_options = {
        "Classification": {
            "Logistic Regression": (LogisticRegression, {"C": [0.01, 0.1, 1], "max_iter": [100, 200]}),
            "Random Forest": (RandomForestClassifier, {"n_estimators": [50, 100], "max_depth": [None, 10]}),
            "SVM": (SVC, {"C": [0.1, 1], "kernel": ["rbf", "linear"]})
        },
        "Regression": {
            "Linear Regression": (LinearRegression, {}),
            "Random Forest": (RandomForestRegressor, {"n_estimators": [50, 100], "max_depth": [None, 10]}),
            "SVR": (SVR, {"C": [0.1, 1], "epsilon": [0.1, 0.2]})
        }
    }
    model_name = st.selectbox("Select Model", list(model_options[task_type].keys()))
    model_class, param_grid = model_options[task_type][model_name]
    model = model_class(random_state=42) if "random_state" in model_class.__init__.__code__.co_varnames else model_class()

    def parse_param(x):
        # Accept ints, floats, None, and plain strings (e.g., kernel names)
        x = x.strip()
        if x == "None":
            return None
        try:
            return float(x) if '.' in x else int(x)
        except ValueError:
            return x

    for param, values in param_grid.items():
        new_values = st.text_input(f"Values for {param} (comma-separated)", ",".join(map(str, values)) if values else "")
        if new_values:
            param_grid[param] = [parse_param(x) for x in new_values.split(',')]
    scoring = st.selectbox("Select Scoring Metric", ["accuracy", "f1"] if task_type == "Classification" else ["r2", "neg_mean_squared_error"])
    try:
        if param_grid:
            with st.spinner("Performing GridSearchCV..."):
                grid_search = GridSearchCV(model, param_grid, cv=3, scoring=scoring, n_jobs=-1)
                grid_search.fit(X, y)
            st.write("Best Parameters:", grid_search.best_params_)
            st.write("Best Score:", grid_search.best_score_)
        else:
            model.fit(X, y)
            st.write("Model trained with default parameters. Score:", model.score(X, y))
    except Exception as e:
        st.error(f"Parameter selection error: {str(e)}")

# Select ML Models
# (Only the names not already imported at the top of the file are imported here.)
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.cluster import SpectralClustering, OPTICS
from sklearn.metrics import (roc_auc_score, confusion_matrix, mean_absolute_percentage_error,
                             adjusted_rand_score, v_measure_score)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import pickle
import warnings

warnings.filterwarnings("ignore", category=UserWarning)  # Suppress sklearn warnings

# Time series imports (checked independently so a missing prophet does not disable statsmodels)
try:
    from statsmodels.tsa.arima.model import ARIMA
    from statsmodels.tsa.holtwinters import ExponentialSmoothing
    HAS_STATSMODELS = True
except ImportError:
    HAS_STATSMODELS = False
try:
    from prophet import Prophet
    HAS_PROPHET = True
except ImportError:
    HAS_PROPHET = False

def select_ml_models(df):
    """
    Builds an end-to-end ML workflow for classification, regression, clustering, or time series forecasting.
    Includes preprocessing, model training, evaluation, hyperparameter tuning, and model saving.

    Args:
        df (pd.DataFrame): Input dataset as a pandas DataFrame.
    """
    st.subheader("Machine Learning Workflow")
    # Select ML task
    analysis_type = st.selectbox("Select Machine Learning Task",
                                 ["Classification", "Regression", "Clustering", "Time Series Forecasting"],
                                 help="Choose the type of ML task to perform.")

    # Preprocessing function
    def preprocess_data(df, target_col=None, task_type=None):
        """
        Preprocesses the dataset based on the ML task.

        Args:
            df (pd.DataFrame): Input dataset.
            target_col (str): Target column name (None for clustering).
            task_type (str): Type of ML task.

        Returns:
            tuple: Preprocessed features (X), target (y), and preprocessor (if applicable).
        """
        if target_col:
            X = df.drop(columns=[target_col])
            y = df[target_col]
        else:
            X = df
            y = None
        # Identify numerical and categorical columns
        numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
        categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
        # Define preprocessing pipeline
        numerical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
            ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False))
        ])
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numerical_transformer, numerical_cols),
                ('cat', categorical_transformer, categorical_cols)
            ])
        if task_type in ["Classification", "Regression"]:
            # Fit and transform features
            X_processed = preprocessor.fit_transform(X)
            # Handle target for classification
            if task_type == "Classification" and y.dtype in ['object', 'category']:
                le = LabelEncoder()
                y = le.fit_transform(y)
                return X_processed, y, preprocessor, le
            return X_processed, y, preprocessor, None
        elif task_type == "Clustering":
            X_processed = preprocessor.fit_transform(X)
            return X_processed, None, preprocessor, None
        elif task_type == "Time Series Forecasting":
            # Time series requires minimal preprocessing here
            return X, y, None, None
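
    # Note: ColumnTransformer returns a NumPy array whose columns follow the order of
    # the transformers list (scaled numerical features first, then the one-hot encoded
    # categorical features), so original column names do not survive preprocessing.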

    # Model evaluation function
    def evaluate_model(model, X_test, y_test, task_type, y_pred=None):
        """
        Evaluates the model using task-specific metrics.

        Args:
            model: Trained model.
            X_test: Test features.
            y_test: Test target.
            task_type: Type of ML task.
            y_pred: Predicted values (optional, computed if None).

        Returns:
            dict: Evaluation metrics.
        """
        if y_pred is None:
            y_pred = model.predict(X_test) if task_type != "Time Series Forecasting" else model.forecast(len(y_test))
        metrics = {}
        if task_type == "Classification":
            # ROC AUC needs positive-class probabilities for binary targets and the
            # full probability matrix for multiclass targets
            if hasattr(model, "predict_proba"):
                proba = model.predict_proba(X_test)
                roc_auc = roc_auc_score(y_test, proba[:, 1]) if proba.shape[1] == 2 else roc_auc_score(y_test, proba, multi_class='ovr')
            else:
                roc_auc = np.nan
            metrics.update({
                "Accuracy": accuracy_score(y_test, y_pred),
                "Precision": precision_score(y_test, y_pred, average='weighted', zero_division=0),
                "Recall": recall_score(y_test, y_pred, average='weighted', zero_division=0),
                "F1 Score": f1_score(y_test, y_pred, average='weighted', zero_division=0),
                "ROC AUC": roc_auc
            })
            # Confusion matrix
            cm = confusion_matrix(y_test, y_pred)
            fig, ax = plt.subplots()
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
            ax.set_title("Confusion Matrix")
            ax.set_xlabel("Predicted")
            ax.set_ylabel("True")
            st.pyplot(fig)
            plt.close(fig)
        elif task_type == "Regression":
            metrics.update({
                "MAE": mean_absolute_error(y_test, y_pred),
                "MSE": mean_squared_error(y_test, y_pred),
                "RMSE": np.sqrt(mean_squared_error(y_test, y_pred)),
                "R²": r2_score(y_test, y_pred),
                "MAPE": mean_absolute_percentage_error(y_test, y_pred) * 100
            })
            # Scatter plot of predictions
            fig = px.scatter(x=y_test, y=y_pred, labels={'x': 'True Values', 'y': 'Predicted Values'},
                             title="True vs Predicted Values")
            fig.add_scatter(x=y_test, y=y_test, mode='lines', name='Ideal')
            st.plotly_chart(fig)
        elif task_type == "Clustering":
            metrics.update({
                "Silhouette Score": silhouette_score(X_test, y_pred) if len(np.unique(y_pred)) > 1 else np.nan,
                "Davies-Bouldin Index": davies_bouldin_score(X_test, y_pred) if len(np.unique(y_pred)) > 1 else np.nan,
                "Calinski-Harabasz Score": calinski_harabasz_score(X_test, y_pred) if len(np.unique(y_pred)) > 1 else np.nan
            })
            # Visualize clusters (if 2D or reducible)
            if X_test.shape[1] == 2:
                fig = px.scatter(x=X_test[:, 0], y=X_test[:, 1], color=y_pred.astype(str),
                                 title="Cluster Visualization", labels={'x': 'Feature 1', 'y': 'Feature 2'})
                st.plotly_chart(fig)
        elif task_type == "Time Series Forecasting":
            metrics.update({
                "MAE": mean_absolute_error(y_test, y_pred),
                "MSE": mean_squared_error(y_test, y_pred),
                "RMSE": np.sqrt(mean_squared_error(y_test, y_pred)),
                "MAPE": mean_absolute_percentage_error(y_test, y_pred) * 100
            })
        return metrics

    # Hyperparameter tuning function
    def tune_model(model, X_train, y_train, task_type, model_name):
        """
        Performs hyperparameter tuning using RandomizedSearchCV.

        Args:
            model: Model to tune.
            X_train: Training features.
            y_train: Training target.
            task_type: Type of ML task.
            model_name: Name of the model.

        Returns:
            tuple: Best model and parameters.
        """
        param_grids = {
            "Logistic Regression": {"C": np.logspace(-3, 3, 10), "penalty": ['l2'], "max_iter": [1000]},
            "Random Forest Classifier": {"n_estimators": [50, 100, 200], "max_depth": [None, 10, 20], "min_samples_split": [2, 5]},
            "SVM Classifier": {"C": [0.1, 1, 10], "kernel": ['rbf', 'linear']},
            "KNN Classifier": {"n_neighbors": [3, 5, 7, 9], "weights": ['uniform', 'distance']},
            "Gradient Boosting Classifier": {"n_estimators": [50, 100], "learning_rate": [0.01, 0.1], "max_depth": [3, 5]},
            "Naive Bayes": {},  # No tuning for Naive Bayes
            "AdaBoost Classifier": {"n_estimators": [50, 100], "learning_rate": [0.01, 0.1]},
            "Extra Trees Classifier": {"n_estimators": [50, 100], "max_depth": [None, 10, 20]},
            "Linear Regression": {},  # No tuning for basic Linear Regression
            "Ridge Regression": {"alpha": [0.1, 1, 10, 100]},
            "Lasso Regression": {"alpha": [0.1, 1, 10, 100]},
            "Random Forest Regressor": {"n_estimators": [50, 100, 200], "max_depth": [None, 10, 20]},
            "SVR": {"C": [0.1, 1, 10], "epsilon": [0.1, 0.2, 0.5]},
            "KNN Regressor": {"n_neighbors": [3, 5, 7, 9], "weights": ['uniform', 'distance']},
            "Gradient Boosting Regressor": {"n_estimators": [50, 100], "learning_rate": [0.01, 0.1], "max_depth": [3, 5]},
            "AdaBoost Regressor": {"n_estimators": [50, 100], "learning_rate": [0.01, 0.1]},
            "Extra Trees Regressor": {"n_estimators": [50, 100], "max_depth": [None, 10, 20]},
            "K-Means": {"n_clusters": [2, 3, 4, 5, 6, 7, 8]},
            "DBSCAN": {"eps": [0.1, 0.5, 1.0], "min_samples": [3, 5, 10]},
            "Agglomerative Clustering": {"n_clusters": [2, 3, 4, 5, 6, 7, 8]},
            "Spectral Clustering": {"n_clusters": [2, 3, 4, 5, 6, 7, 8]},
            "OPTICS": {"min_samples": [3, 5, 10], "xi": [0.05, 0.1]},
            "ARIMA": {"order": [(1, 1, 1), (2, 1, 1), (1, 1, 2)]},
            "Exponential Smoothing": {"trend": ["add", None], "seasonal": ["add", None]},
            "Prophet": {"changepoint_prior_scale": [0.01, 0.05, 0.1], "seasonality_prior_scale": [5, 10, 15]}
        }
        if model_name in param_grids and param_grids[model_name]:
            search = RandomizedSearchCV(model, param_distributions=param_grids[model_name],
                                        n_iter=10, cv=3, scoring='accuracy' if task_type == "Classification" else 'r2',
                                        n_jobs=-1, random_state=42)
            search.fit(X_train, y_train)
            return search.best_estimator_, search.best_params_
        return model, {}
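
    # Unlike GridSearchCV, RandomizedSearchCV samples n_iter=10 parameter combinations
    # instead of exhaustively trying every one, which keeps tuning fast for the larger
    # grids above.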

    # Model saving function
    def save_model(model, model_name):
        """
        Serializes and offers the model for download.

        Args:
            model: Trained model.
            model_name: Name of the model.

        Returns:
            BytesIO: Serialized model file.
        """
        model_file = BytesIO()
        pickle.dump(model, model_file)
        model_file.seek(0)
        return model_file
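
    # Round trip (illustrative): a downloaded .pkl can be restored later with
    #     with open("model.pkl", "rb") as f:
    #         model = pickle.load(f)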

    # Classification and Regression
    if analysis_type in ["Classification", "Regression"]:
        st.markdown("### Configure Model")
        target_col = st.selectbox("Select Target Variable", df.columns, key="target_col")
        feature_cols = st.multiselect("Select Feature Columns", [col for col in df.columns if col != target_col],
                                      key="feature_cols")
        if not feature_cols:
            st.warning("Please select at least one feature column.")
            return
        # Validate target
        if analysis_type == "Classification":
            if pd.api.types.is_numeric_dtype(df[target_col]) and df[target_col].nunique() > len(df) // 10:
                st.error("Target appears continuous. Consider binning or switching to Regression.")
                return
        elif analysis_type == "Regression":
            if not pd.api.types.is_numeric_dtype(df[target_col]):
                st.error("Target must be numeric for regression.")
                return
        # Preprocess data
        try:
            X, y, preprocessor, le = preprocess_data(df[feature_cols + [target_col]], target_col, analysis_type)
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        except Exception as e:
            st.error(f"Preprocessing error: {str(e)}")
            return
        # Model options
        model_options = {
            "Classification": {
                "Logistic Regression": LogisticRegression(random_state=42),
                "Random Forest Classifier": RandomForestClassifier(random_state=42),
                "SVM Classifier": SVC(random_state=42, probability=True),
                "KNN Classifier": KNeighborsClassifier(),
                "Gradient Boosting Classifier": GradientBoostingClassifier(random_state=42),
                "Naive Bayes": GaussianNB(),
                "AdaBoost Classifier": AdaBoostClassifier(random_state=42),
                "Extra Trees Classifier": ExtraTreesClassifier(random_state=42)
            },
            "Regression": {
                "Linear Regression": LinearRegression(),
                "Ridge Regression": Ridge(random_state=42),
                "Lasso Regression": Lasso(random_state=42),
                "Random Forest Regressor": RandomForestRegressor(random_state=42),
                "SVR": SVR(),
                "KNN Regressor": KNeighborsRegressor(),
                "Gradient Boosting Regressor": GradientBoostingRegressor(random_state=42),
                "AdaBoost Regressor": AdaBoostRegressor(random_state=42),
                "Extra Trees Regressor": ExtraTreesRegressor(random_state=42)
            }
        }[analysis_type]
        selected_model_name = st.selectbox("Select Model", list(model_options.keys()), key="model_select")
        model = model_options[selected_model_name]
        # Train model
        if st.button("Train Model", key="train_button"):
            with st.spinner("Training model..."):
                try:
                    model.fit(X_train, y_train)
                    y_pred = model.predict(X_test)
                    metrics = evaluate_model(model, X_test, y_test, analysis_type, y_pred)
                    st.markdown("### Model Performance")
                    st.dataframe(pd.DataFrame(metrics.items(), columns=["Metric", "Value"]).round(4))
                    # Cross-validation
                    cv_scores = cross_val_score(model, X, y, cv=5,
                                                scoring='accuracy' if analysis_type == "Classification" else 'r2')
                    st.write(f"**Cross-Validation Scores** (5-fold): Mean = {cv_scores.mean():.4f}, Std = {cv_scores.std():.4f}")
                    # Store model in session state
                    st.session_state['trained_model'] = model
                    st.session_state['model_name'] = selected_model_name
                except Exception as e:
                    st.error(f"Training error: {str(e)}")
                    return
        # Hyperparameter tuning
        if st.button("Perform Hyperparameter Tuning", key="tune_button"):
            with st.spinner("Tuning hyperparameters..."):
                try:
                    tuned_model, best_params = tune_model(model, X_train, y_train, analysis_type, selected_model_name)
                    tuned_model.fit(X_train, y_train)
                    y_pred = tuned_model.predict(X_test)
                    metrics = evaluate_model(tuned_model, X_test, y_test, analysis_type, y_pred)
                    st.markdown("### Tuned Model Performance")
                    st.dataframe(pd.DataFrame(metrics.items(), columns=["Metric", "Value"]).round(4))
                    st.write("**Best Hyperparameters**:", best_params)
                    # Update stored model
                    st.session_state['trained_model'] = tuned_model
                    st.session_state['model_name'] = selected_model_name + "_Tuned"
                except Exception as e:
                    st.error(f"Tuning error: {str(e)}")
        # Save model
        if 'trained_model' in st.session_state and st.button("Save The Model", key="save_button"):
            model_file = save_model(st.session_state['trained_model'], st.session_state['model_name'])
            st.download_button(
                label="Download Model",
                data=model_file,
                file_name=f"{st.session_state['model_name'].replace(' ', '_').lower()}_model.pkl",
                mime="application/octet-stream",
                key="download_model"
            )

    # Clustering
    elif analysis_type == "Clustering":
        st.markdown("### Configure Clustering")
        feature_cols = st.multiselect("Select Features for Clustering", df.columns, key="cluster_cols")
        if not feature_cols:
            st.warning("Please select at least one feature column.")
            return
        # Preprocess data
        try:
            X, _, preprocessor, _ = preprocess_data(df[feature_cols], task_type="Clustering")
        except Exception as e:
            st.error(f"Preprocessing error: {str(e)}")
            return
        n_clusters = st.slider("Number of Clusters (for applicable algorithms)", 2, 10, 3, key="n_clusters")
        clustering_models = {
            "K-Means": KMeans(n_clusters=n_clusters, random_state=42),
            "DBSCAN": DBSCAN(eps=0.5, min_samples=5),
            "Agglomerative Clustering": AgglomerativeClustering(n_clusters=n_clusters),
            "Spectral Clustering": SpectralClustering(n_clusters=n_clusters, random_state=42),
            "OPTICS": OPTICS(min_samples=5)
        }
        selected_model_name = st.selectbox("Select Clustering Algorithm", list(clustering_models.keys()),
                                           key="cluster_model_select")
        model = clustering_models[selected_model_name]
        # Train model
        if st.button("Perform Clustering", key="cluster_button"):
            with st.spinner("Performing clustering..."):
                try:
                    clusters = model.fit_predict(X)
                    df_with_clusters = df.copy()
                    df_with_clusters['Cluster'] = clusters
                    st.markdown("### Clustered Data Sample")
                    st.dataframe(df_with_clusters.head())
                    # Evaluate clustering
                    metrics = evaluate_model(model, X, clusters, "Clustering", clusters)
                    st.markdown("### Clustering Metrics")
                    st.dataframe(pd.DataFrame(metrics.items(), columns=["Metric", "Value"]).round(4))
                    # Store model
                    st.session_state['trained_model'] = model
                    st.session_state['model_name'] = selected_model_name
                except Exception as e:
                    st.error(f"Clustering error: {str(e)}")
                    return
        # Hyperparameter tuning
        if st.button("Perform Hyperparameter Tuning", key="cluster_tune_button"):
            with st.spinner("Tuning hyperparameters..."):
                try:
                    # Recompute baseline clusters first: each button click is a separate
                    # Streamlit rerun, so `clusters` from the training branch is not in scope
                    clusters = model.fit_predict(X)
                    tuned_model, best_params = tune_model(model, X, clusters, "Clustering", selected_model_name)
                    clusters = tuned_model.fit_predict(X)
                    df_with_clusters = df.copy()
                    df_with_clusters['Cluster'] = clusters
                    st.markdown("### Tuned Clustered Data Sample")
                    st.dataframe(df_with_clusters.head())
                    metrics = evaluate_model(tuned_model, X, clusters, "Clustering", clusters)
                    st.markdown("### Tuned Clustering Metrics")
                    st.dataframe(pd.DataFrame(metrics.items(), columns=["Metric", "Value"]).round(4))
                    st.write("**Best Hyperparameters**:", best_params)
                    # Update stored model
                    st.session_state['trained_model'] = tuned_model
                    st.session_state['model_name'] = selected_model_name + "_Tuned"
                except Exception as e:
                    st.error(f"Tuning error: {str(e)}")
        # Save model
        if 'trained_model' in st.session_state and st.button("Save The Model", key="cluster_save_button"):
            model_file = save_model(st.session_state['trained_model'], st.session_state['model_name'])
            st.download_button(
                label="Download Model",
                data=model_file,
                file_name=f"{st.session_state['model_name'].replace(' ', '_').lower()}_model.pkl",
                mime="application/octet-stream",
                key="download_cluster_model"
            )
| # Time Series Forecasting | |
| elif analysis_type == "Time Series Forecasting": | |
| if not (HAS_STATSMODELS or HAS_PROPHET): | |
| st.error("Please install statsmodels or prophet: `pip install statsmodels fbprophet`") | |
| return | |
| st.markdown("### Configure Time Series Forecasting") | |
| datetime_cols = df.select_dtypes(include=['datetime64']).columns | |
| if datetime_cols.empty: | |
| st.error("No datetime columns found. Please ensure a datetime column exists.") | |
| return | |
| date_col = st.selectbox("Select Date Column", datetime_cols, key="date_col") | |
| value_col = st.selectbox("Select Value Column", | |
| df.select_dtypes(include=['float64', 'int64']).columns, | |
| key="value_col") | |
| # Prepare data | |
| try: | |
| ts_df = df[[date_col, value_col]].dropna() | |
| ts_df[date_col] = pd.to_datetime(ts_df[date_col]) | |
| ts_df = ts_df.sort_values(date_col) | |
| except Exception as e: | |
| st.error(f"Data preparation error: {str(e)}") | |
| return | |
| # Time-based split | |
| train_pct = st.slider("Training Data Proportion (%)", 50, 95, 80, key="train_size") | |
| split_idx = int(len(ts_df) * (train_pct / 100)) | |
| train, test = ts_df[:split_idx], ts_df[split_idx:] | |
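| # Note: this split is deliberately chronological; a shuffled split (e.g. train_test_split) would | |
| # leak future observations into training for time series data. | |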
| # Model options | |
| forecast_models = {} | |
| if HAS_STATSMODELS: | |
| forecast_models.update({ | |
| "ARIMA": lambda data: ARIMA(data[value_col], order=(1,1,1)).fit(), | |
| "Exponential Smoothing": lambda data: ExponentialSmoothing( | |
| data[value_col], trend='add', seasonal='add', seasonal_periods=12 | |
| ).fit() | |
| }) | |
| if HAS_PROPHET: | |
| forecast_models["Prophet"] = lambda data: Prophet().fit( | |
| data.rename(columns={date_col: 'ds', value_col: 'y'}) | |
| ) | |
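| # Hedged sketch (an assumption, not part of the original app): the fixed ARIMA order (1,1,1) | |
| # above could instead be chosen by AIC over a small grid. | |
| # best_order, best_aic = (1, 1, 1), float('inf') | |
| # for p in range(3): | |
| #     for d in range(2): | |
| #         for q in range(3): | |
| #             try: | |
| #                 aic = ARIMA(train[value_col], order=(p, d, q)).fit().aic | |
| #             except Exception: | |
| #                 continue | |
| #             if aic < best_aic: | |
| #                 best_aic, best_order = aic, (p, d, q) | |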
| selected_model_name = st.selectbox("Select Forecasting Model", list(forecast_models.keys()), | |
| key="ts_model_select") | |
| # Train model | |
| if st.button("Train Model", key="ts_train_button"): | |
| with st.spinner("Training time series model..."): | |
| try: | |
| if selected_model_name == "Prophet": | |
| model = forecast_models[selected_model_name](train) | |
| future = model.make_future_dataframe(periods=len(test)) | |
| forecast = model.predict(future) | |
| y_pred = forecast['yhat'][-len(test):].values | |
| y_test = test[value_col].values | |
| else: | |
| model = forecast_models[selected_model_name](train) | |
| y_pred = model.forecast(steps=len(test)) | |
| y_test = test[value_col].values | |
| # Evaluate | |
| metrics = evaluate_model(model, test, y_test, "Time Series Forecasting", y_pred) | |
| st.markdown("### Model Performance") | |
| st.dataframe(pd.DataFrame(metrics.items(), columns=["Metric", "Value"]).round(4)) | |
| # Plot forecast | |
| fig, ax = plt.subplots(figsize=(10, 5)) | |
| ax.plot(train[date_col], train[value_col], label="Train") | |
| ax.plot(test[date_col], test[value_col], label="Test") | |
| ax.plot(test[date_col], y_pred, label="Forecast") | |
| ax.set_title(f"{selected_model_name} Forecast") | |
| ax.legend() | |
| st.pyplot(fig) | |
| plt.close(fig) | |
| # Store model | |
| st.session_state['trained_model'] = model | |
| st.session_state['model_name'] = selected_model_name | |
| except Exception as e: | |
| st.error(f"Training error: {str(e)}") | |
| return | |
| # Hyperparameter tuning | |
| if st.button("Perform Hyperparameter Tuning", key="ts_tune_button"): | |
| with st.spinner("Tuning hyperparameters..."): | |
| try: | |
| if selected_model_name == "Prophet": | |
| param_grid = { | |
| "changepoint_prior_scale": [0.01, 0.05, 0.1], | |
| "seasonality_prior_scale": [5, 10, 15] | |
| } | |
| best_score = float('inf') | |
| best_params = {} | |
| best_model = None | |
| for cps in param_grid["changepoint_prior_scale"]: | |
| for sps in param_grid["seasonality_prior_scale"]: | |
| model = Prophet(changepoint_prior_scale=cps, seasonality_prior_scale=sps) | |
| model.fit(train.rename(columns={date_col: 'ds', value_col: 'y'})) | |
| future = model.make_future_dataframe(periods=len(test)) | |
| forecast = model.predict(future) | |
| score = mean_squared_error(test[value_col], forecast['yhat'][-len(test):]) | |
| if score < best_score: | |
| best_score = score | |
| best_params = {"changepoint_prior_scale": cps, "seasonality_prior_scale": sps} | |
| best_model = model | |
| model = best_model | |
| else: | |
| # `model` is not defined on this rerun (button state does not persist), so fit a baseline first | |
| base_model = forecast_models[selected_model_name](train) | |
| model, best_params = tune_model(base_model, train[value_col], None, | |
| "Time Series Forecasting", selected_model_name) | |
| # Re-evaluate | |
| if selected_model_name == "Prophet": | |
| future = model.make_future_dataframe(periods=len(test)) | |
| forecast = model.predict(future) | |
| y_pred = forecast['yhat'][-len(test):].values | |
| y_test = test[value_col].values | |
| else: | |
| # evaluate the tuned model directly (assuming tune_model returns a fitted results object); | |
| # refitting the default model here would throw the tuned parameters away | |
| y_pred = model.forecast(steps=len(test)) | |
| y_test = test[value_col].values | |
| metrics = evaluate_model(model, test, y_test, "Time Series Forecasting", y_pred) | |
| st.markdown("### Tuned Model Performance") | |
| st.dataframe(pd.DataFrame(metrics.items(), columns=["Metric", "Value"]).round(4)) | |
| st.write("**Best Hyperparameters**:", best_params) | |
| # Plot tuned forecast | |
| fig, ax = plt.subplots(figsize=(10, 5)) | |
| ax.plot(train[date_col], train[value_col], label="Train") | |
| ax.plot(test[date_col], test[value_col], label="Test") | |
| ax.plot(test[date_col], y_pred, label="Tuned Forecast") | |
| ax.set_title(f"Tuned {selected_model_name} Forecast") | |
| ax.legend() | |
| st.pyplot(fig) | |
| plt.close(fig) | |
| # Update stored model | |
| st.session_state['trained_model'] = model | |
| st.session_state['model_name'] = selected_model_name + "_Tuned" | |
| except Exception as e: | |
| st.error(f"Tuning error: {str(e)}") | |
| # Save model | |
| if 'trained_model' in st.session_state and st.button("Save The Model", key="ts_save_button"): | |
| model_file = save_model(st.session_state['trained_model'], st.session_state['model_name']) | |
| st.download_button( | |
| label="Download Model", | |
| data=model_file, | |
| file_name=f"{st.session_state['model_name'].replace(' ', '_').lower()}_model.pkl", | |
| mime="application/octet-stream", | |
| key="download_ts_model" | |
| ) | |
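| # Example of consuming the downloaded pickle outside the app (hedged; the file name depends on | |
| # the model you saved): | |
| # import pickle | |
| # with open("arima_model.pkl", "rb") as f: | |
| #     fitted = pickle.load(f) | |
| # print(fitted.forecast(steps=12))  # statsmodels results objects expose .forecast() | |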
| # Clear Modified Dataset | |
| def clear_modified_dataset(): | |
| st.subheader("Clear Modified Dataset") | |
| st.session_state.pop('uploaded_df', None) | |
| st.write("Dataset cleared.") | |
| # Chat with the dataset | |
| def chat_with_dataset(df): | |
| st.subheader("Chat with Your Dataset") | |
| st.write("Ask questions about your dataset. For example, 'What is the average value of column X?' or 'Show me the top 5 rows.'") | |
| user_query = st.text_area("Enter your query:", height=100) | |
| if st.button("Ask"): | |
| if not user_query.strip(): | |
| st.warning("Please enter a query.") | |
| return | |
| try: | |
| chat_completion = client.chat.completions.create( | |
| messages=[ | |
| { | |
| "role": "system", | |
| "content": ( | |
| "You are an expert data analyst. Answer the user's questions about the provided pandas DataFrame. " | |
| "Use Python pandas to analyze the data and provide concise answers. " | |
| "If the user asks for code, generate Python code snippets using pandas to perform the requested operation. " | |
| "Do not include explanations unless explicitly requested." | |
| ), | |
| }, | |
| {"role": "user", "content": f"The dataset is:\n{df.head(5).to_string()}\n\n{user_query}"}, | |
| ], | |
| model="llama-3.3-70b-versatile", | |
| ) | |
| response = chat_completion.choices[0].message.content.strip() | |
| st.session_state['chat_response'] = response  # persist across reruns so the Execute button below can work | |
| except Exception as e: | |
| st.error(f"Error with Groq API: {str(e)}") | |
| # Render outside the "Ask" handler: a button nested inside another button's if-block never fires in | |
| # Streamlit, because clicking it reruns the script and the outer button reads False again. | |
| if 'chat_response' in st.session_state: | |
| response = st.session_state['chat_response'] | |
| st.write("Response:") | |
| st.code(response, language="python" if "def " in response or "import " in response else None) | |
| st.write("You can execute the generated code below:") | |
| if st.button("Execute Generated Code"): | |
| try: | |
| safe_globals = {"pd": pd, "plt": plt, "sns": sns, "df": df, "io": io, "np": np} | |
| safe_locals = {} | |
| # LLM replies often wrap code in ``` fences; strip them before executing | |
| code_to_run = re.sub(r"^```(?:python)?\s*|\s*```$", "", response.strip()) | |
| exec(code_to_run, safe_globals, safe_locals) | |
| # Check for matplotlib or seaborn plots | |
| if "plt." in response or "sns." in response: | |
| st.pyplot(plt.gcf()) | |
| plt.clf() | |
| # Check for DataFrame outputs | |
| elif "pd.DataFrame" in response or "df" in response: | |
| output_df = safe_locals.get("df", None) | |
| if isinstance(output_df, pd.DataFrame): | |
| st.write("Generated DataFrame:") | |
| st.dataframe(output_df) | |
| else: | |
| st.write("Code executed successfully. Check the output above if applicable.") | |
| else: | |
| st.write("Code executed successfully. Check the output above if applicable.") | |
| except Exception as e: | |
| st.error(f"Error executing code: {str(e)}") | |
| # NLP Pipeline | |
| def nlp_pipeline_tab(): | |
| """Function for the NLP Pipeline tab.""" | |
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| import plotly.express as px | |
| from sklearn.preprocessing import LabelEncoder | |
| from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV | |
| from sklearn.linear_model import LogisticRegression | |
| from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier | |
| from sklearn.svm import SVC | |
| from sklearn.naive_bayes import MultinomialNB | |
| from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, | |
| roc_auc_score, confusion_matrix) | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from gensim import corpora | |
| from gensim.models import LdaModel | |
| import spacy | |
| import nltk | |
| from nltk.tokenize import word_tokenize | |
| from nltk.corpus import stopwords | |
| from nltk.stem import WordNetLemmatizer | |
| from io import BytesIO | |
| import pickle | |
| st.header("NLP Pipeline") | |
| st.markdown("Perform Natural Language Processing tasks like text classification, sentiment analysis, topic modeling, and named entity recognition.") | |
| # Access uploaded dataset from session state | |
| if 'uploaded_df' not in st.session_state: | |
| st.error("No dataset uploaded. Please upload a CSV file in the main DataGenie app.") | |
| return | |
| df = st.session_state['uploaded_df'] | |
| st.write("Dataset Preview:") | |
| st.dataframe(df.head()) | |
| # Select NLP task | |
| nlp_task = st.selectbox("Select NLP Task", | |
| ["Text Classification", "Sentiment Analysis", "Topic Modeling", "Named Entity Recognition"], | |
| help="Choose the specific NLP task to perform.") | |
| # Preprocessing function | |
| def preprocess_nlp_data(df, text_col, target_col=None, task_type=None): | |
| stop_words = set(stopwords.words('english')) | |
| lemmatizer = WordNetLemmatizer() | |
| def clean_text(text): | |
| if not isinstance(text, str): | |
| return "" | |
| tokens = word_tokenize(text.lower()) | |
| tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalnum()] | |
| tokens = [token for token in tokens if token not in stop_words] | |
| return " ".join(tokens) | |
| X_text = df[text_col].apply(clean_text) | |
| if task_type in ["Text Classification", "Sentiment Analysis"]: | |
| vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2)) | |
| X = vectorizer.fit_transform(X_text).toarray() | |
| y = df[target_col] if target_col else None | |
| if task_type == "Text Classification" and y.dtype in ['object', 'category']: | |
| le = LabelEncoder() | |
| y = le.fit_transform(y) | |
| return X, y, vectorizer, le | |
| return X, y, vectorizer, None | |
| elif task_type == "Topic Modeling": | |
| texts = [text.split() for text in X_text] | |
| dictionary = corpora.Dictionary(texts) | |
| corpus = [dictionary.doc2bow(text) for text in texts] | |
| return corpus, None, dictionary, None | |
| elif task_type == "Named Entity Recognition": | |
| return X_text, None, None, None | |
| # Evaluation function | |
| def evaluate_model(model, X_test, y_test, task_type, y_pred=None, vectorizer=None): | |
| metrics = {} | |
| if task_type in ["Text Classification", "Sentiment Analysis"]: | |
| if y_pred is None: | |
| y_pred = model.predict(X_test) | |
| metrics.update({ | |
| "Accuracy": accuracy_score(y_test, y_pred), | |
| "Precision": precision_score(y_test, y_pred, average='weighted', zero_division=0), | |
| "Recall": recall_score(y_test, y_pred, average='weighted', zero_division=0), | |
| "F1 Score": f1_score(y_test, y_pred, average='weighted', zero_division=0), | |
| "ROC AUC": roc_auc_score(y_test, model.predict_proba(X_test)[:, 1], multi_class='ovr') if hasattr(model, "predict_proba") else np.nan | |
| }) | |
| cm = confusion_matrix(y_test, y_pred) | |
| fig, ax = plt.subplots() | |
| sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax) | |
| ax.set_title("Confusion Matrix") | |
| ax.set_xlabel("Predicted") | |
| ax.set_ylabel("True") | |
| st.pyplot(fig) | |
| plt.close(fig) | |
| elif task_type == "Topic Modeling": | |
| metrics["Number of Topics"] = model.num_topics | |
| topics = model.print_topics(num_words=5) | |
| for topic_id, topic in topics: | |
| st.write(f"**Topic {topic_id}**: {topic}") | |
| return metrics | |
| # Hyperparameter tuning function | |
| def tune_model(model, X_train, y_train, task_type, model_name): | |
| # gensim's LdaModel is not a scikit-learn estimator, so it cannot go through RandomizedSearchCV; | |
| # sweep num_topics manually and keep the model with the best per-word likelihood bound instead | |
| if model_name == "LDA": | |
| best_model, best_params, best_bound = model, {}, -np.inf | |
| for k in [5, 10, 15, 20]: | |
| candidate = LdaModel(corpus=X_train, id2word=model.id2word, num_topics=k, random_state=42) | |
| bound = candidate.log_perplexity(X_train)  # higher (less negative) is better | |
| if bound > best_bound: | |
| best_bound, best_model, best_params = bound, candidate, {"num_topics": k} | |
| return best_model, best_params | |
| param_grids = { | |
| "Logistic Regression": {"C": np.logspace(-3, 3, 10), "penalty": ['l2'], "max_iter": [1000]}, | |
| "Random Forest Classifier": {"n_estimators": [50, 100, 200], "max_depth": [None, 10, 20]}, | |
| "SVM Classifier": {"C": [0.1, 1, 10], "kernel": ['rbf', 'linear']}, | |
| "Gradient Boosting Classifier": {"n_estimators": [50, 100], "learning_rate": [0.01, 0.1]}, | |
| "Naive Bayes": {"alpha": [0.1, 0.5, 1.0]} | |
| } | |
| if model_name in param_grids: | |
| # the remaining models are all classifiers, so accuracy is a valid scoring metric | |
| search = RandomizedSearchCV(model, param_distributions=param_grids[model_name], | |
| n_iter=10, cv=3, scoring='accuracy', n_jobs=-1, random_state=42) | |
| search.fit(X_train, y_train) | |
| return search.best_estimator_, search.best_params_ | |
| return model, {} | |
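| # Design note: RandomizedSearchCV samples n_iter=10 combinations from the grids above instead of | |
| # trying all of them exhaustively (GridSearchCV), which keeps tuning tractable on TF-IDF matrices | |
| # with thousands of columns. | |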
| # Model saving function | |
| def save_model(model, model_name): | |
| model_file = BytesIO() | |
| pickle.dump(model, model_file) | |
| model_file.seek(0) | |
| return model_file | |
| # Identify text columns | |
| text_cols = df.select_dtypes(include=['object']).columns.tolist() | |
| if not text_cols: | |
| st.error("No text columns found in the dataset.") | |
| return | |
| text_col = st.selectbox("Select Text Column", text_cols, key="nlp_text_col") | |
| # Text Classification and Sentiment Analysis | |
| if nlp_task in ["Text Classification", "Sentiment Analysis"]: | |
| target_col = st.selectbox("Select Target Variable", | |
| [col for col in df.columns if col != text_col], | |
| key="nlp_target_col") | |
| if nlp_task == "Sentiment Analysis" and df[target_col].nunique() > 10: | |
| st.error("Target has too many unique values for sentiment analysis.") | |
| return | |
| try: | |
| X, y, vectorizer, le = preprocess_nlp_data(df, text_col, target_col, nlp_task) | |
| X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) | |
| except Exception as e: | |
| st.error(f"Preprocessing error: {str(e)}") | |
| return | |
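| # Suggestion (not in the original): passing stratify=y to train_test_split above preserves class | |
| # proportions in both splits, which matters for imbalanced label distributions. | |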
| model_options = { | |
| "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000), | |
| "Random Forest Classifier": RandomForestClassifier(random_state=42), | |
| "SVM Classifier": SVC(random_state=42, probability=True), | |
| "Naive Bayes": MultinomialNB(), | |
| "Gradient Boosting Classifier": GradientBoostingClassifier(random_state=42) | |
| } | |
| selected_model_name = st.selectbox("Select Model", list(model_options.keys()), key="nlp_model_select") | |
| model = model_options[selected_model_name] | |
| if st.button("Train Model", key="nlp_train_button"): | |
| with st.spinner("Training NLP model..."): | |
| try: | |
| model.fit(X_train, y_train) | |
| y_pred = model.predict(X_test) | |
| metrics = evaluate_model(model, X_test, y_test, nlp_task, y_pred, vectorizer) | |
| st.markdown("### Model Performance") | |
| st.dataframe(pd.DataFrame(metrics.items(), columns=["Metric", "Value"]).round(4)) | |
| cv_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy') | |
| st.write(f"**Cross-Validation Scores** (5-fold): Mean = {cv_scores.mean():.4f}, Std = {cv_scores.std():.4f}") | |
| if selected_model_name in ["Random Forest Classifier", "Gradient Boosting Classifier"]: | |
| feature_names = vectorizer.get_feature_names_out() | |
| importance = model.feature_importances_ | |
| top_features = pd.DataFrame({"Feature": feature_names, "Importance": importance}).nlargest(10, "Importance") | |
| fig = px.bar(top_features, x="Feature", y="Importance", title="Top 10 Feature Importances") | |
| st.plotly_chart(fig) | |
| st.session_state['nlp_trained_model'] = model | |
| st.session_state['nlp_model_name'] = selected_model_name | |
| st.session_state['vectorizer'] = vectorizer | |
| st.session_state['label_encoder'] = le | |
| except Exception as e: | |
| st.error(f"Training error: {str(e)}") | |
| if st.button("Perform Hyperparameter Tuning", key="nlp_tune_button"): | |
| with st.spinner("Tuning hyperparameters..."): | |
| try: | |
| tuned_model, best_params = tune_model(model, X_train, y_train, nlp_task, selected_model_name) | |
| tuned_model.fit(X_train, y_train) | |
| y_pred = tuned_model.predict(X_test) | |
| metrics = evaluate_model(tuned_model, X_test, y_test, nlp_task, y_pred, vectorizer) | |
| st.markdown("### Tuned Model Performance") | |
| st.dataframe(pd.DataFrame(metrics.items(), columns=["Metric", "Value"]).round(4)) | |
| st.write("**Best Hyperparameters**:", best_params) | |
| st.session_state['nlp_trained_model'] = tuned_model | |
| st.session_state['nlp_model_name'] = selected_model_name + "_Tuned" | |
| except Exception as e: | |
| st.error(f"Tuning error: {str(e)}") | |
| if 'nlp_trained_model' in st.session_state and st.button("Save The Model", key="nlp_save_button"): | |
| model_file = save_model(st.session_state['nlp_trained_model'], st.session_state['nlp_model_name']) | |
| st.download_button( | |
| label="Download Model", | |
| data=model_file, | |
| file_name=f"{st.session_state['nlp_model_name'].replace(' ', '_').lower()}_model.pkl", | |
| mime="application/octet-stream", | |
| key="download_nlp_model" | |
| ) | |
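| # Hedged sketch of scoring new text with the artefacts stored above (key names as set in this app): | |
| # vec = st.session_state['vectorizer'] | |
| # clf = st.session_state['nlp_trained_model'] | |
| # pred = clf.predict(vec.transform(["great product, would buy again"]).toarray()) | |
| # le = st.session_state.get('label_encoder') | |
| # st.write(le.inverse_transform(pred) if le is not None else pred) | |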
| # Topic Modeling | |
| elif nlp_task == "Topic Modeling": | |
| try: | |
| corpus, _, dictionary, _ = preprocess_nlp_data(df, text_col, task_type="Topic Modeling") | |
| except Exception as e: | |
| st.error(f"Preprocessing error: {str(e)}") | |
| return | |
| num_topics = st.slider("Number of Topics", 2, 20, 5, key="num_topics") | |
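| # Note: LdaModel trains at construction time, so the next line retrains on every Streamlit rerun; | |
| # acceptable for small corpora, but consider caching (st.cache_resource) for larger ones. | |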
| model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, random_state=42) | |
| if st.button("Perform Topic Modeling", key="topic_button"): | |
| with st.spinner("Performing topic modeling..."): | |
| try: | |
| st.markdown("### Topic Modeling Results") | |
| # evaluate_model writes the top words of each topic itself; its return value only carries counts, | |
| # so iterating metrics.items() for topic strings would print nothing | |
| metrics = evaluate_model(model, corpus, None, "Topic Modeling") | |
| st.write(f"**Number of Topics**: {metrics['Number of Topics']}") | |
| st.session_state['nlp_trained_model'] = model | |
| st.session_state['nlp_model_name'] = "LDA" | |
| except Exception as e: | |
| st.error(f"Topic modeling error: {str(e)}") | |
| if st.button("Perform Hyperparameter Tuning", key="topic_tune_button"): | |
| with st.spinner("Tuning topics..."): | |
| try: | |
| tuned_model, best_params = tune_model(model, corpus, None, "Topic Modeling", "LDA") | |
| st.markdown("### Tuned Topic Modeling Results") | |
| metrics = evaluate_model(tuned_model, corpus, None, "Topic Modeling")  # also writes the tuned topics | |
| st.write(f"**Number of Topics**: {metrics['Number of Topics']}") | |
| st.write("**Best Hyperparameters**:", best_params) | |
| st.session_state['nlp_trained_model'] = tuned_model | |
| st.session_state['nlp_model_name'] = "LDA_Tuned" | |
| except Exception as e: | |
| st.error(f"Tuning error: {str(e)}") | |
| if 'nlp_trained_model' in st.session_state and st.button("Save The Model", key="topic_save_button"): | |
| model_file = save_model(st.session_state['nlp_trained_model'], st.session_state['nlp_model_name']) | |
| st.download_button( | |
| label="Download Model", | |
| data=model_file, | |
| file_name=f"{st.session_state['nlp_model_name'].replace(' ', '_').lower()}_model.pkl", | |
| mime="application/octet-stream", | |
| key="download_topic_model" | |
| ) | |
| # Named Entity Recognition | |
| elif nlp_task == "Named Entity Recognition": | |
| try: | |
| X_text, _, _, _ = preprocess_nlp_data(df, text_col, task_type="Named Entity Recognition") | |
| except Exception as e: | |
| st.error(f"Preprocessing error: {str(e)}") | |
| return | |
| nlp = spacy.load("en_core_web_sm", disable=["parser", "textcat"]) | |
| if st.button("Perform NER", key="ner_button"): | |
| with st.spinner("Performing Named Entity Recognition..."): | |
| try: | |
| st.markdown("### NER Results") | |
| for text in X_text[:10]: | |
| doc = nlp(text) | |
| st.write(f"**Text**: {text[:100]}...") | |
| entities = [(ent.text, ent.label_) for ent in doc.ents] | |
| if entities: | |
| st.dataframe(pd.DataFrame(entities, columns=["Entity", "Label"])) | |
| else: | |
| st.write("No entities detected.") | |
| st.info("NER uses a pre-trained spaCy model, so no training or saving is required.") | |
| except Exception as e: | |
| st.error(f"NER error: {str(e)}") | |
| # Main app layout | |
| add_custom_styles() | |
| st.title("") | |
| add_header() | |
| tab1, tab2, tab3, tab4 = st.tabs(["Dataset Generator", "Example Prompts", "Chat with Dataset", "NLP Pipeline"]) | |
| with tab1: | |
| st.header("Generate Synthetic Datasets") | |
| st.write("Enter a prompt to generate a synthetic dataset. Be as descriptive as possible (e.g., 'Generate 500 rows for heart risk prediction with age, common symptoms like chest pain and shortness of breath, and a risk level (yes/no)'). For more examples, check the 'Example Prompts' tab.") | |
| prompt = st.text_area("Your prompt:", height=100) | |
| if "generated_code" not in st.session_state: | |
| st.session_state.generated_code = None | |
| st.session_state.expected_rows = None | |
| if st.button("Generate Code"): | |
| if prompt: | |
| code = generate_dataset_code(prompt) | |
| if code: | |
| st.session_state.generated_code = code | |
| st.session_state.expected_rows = extract_row_count(prompt) | |
| st.subheader("Generated Python Code") | |
| st.code(code, language="python") | |
| st.info("Review the code and click 'Get the Dataset'.") | |
| else: | |
| st.error("Generated code does not define create_dataset function correctly.") | |
| else: | |
| st.warning("Enter a prompt.") | |
| if st.session_state.generated_code and st.button("Get the Dataset"): | |
| df = execute_code(st.session_state.generated_code) | |
| if df is not None: | |
| # guard against a missing row count (extract_row_count may return None if the prompt names none) | |
| if st.session_state.expected_rows and len(df) != st.session_state.expected_rows: | |
| st.warning(f"Dataset has {len(df)} rows; requested {st.session_state.expected_rows}.") | |
| st.subheader("Generated Dataset") | |
| st.write(f"Rows: {len(df)}, Columns: {', '.join(df.columns)}") | |
| st.dataframe(df.head()) | |
| csv_bytes = to_csv_bytes(df) | |
| st.download_button(label="Download CSV", data=csv_bytes, file_name="datagenie_dataset.csv", mime="text/csv") | |
| with tab2: | |
| st.header("Example Prompts") | |
| st.write("Explore example prompts to generate synthetic datasets for various domains.") | |
| st.subheader("๐ผ Finance & Business") | |
| st.write("Generate 1000 customer records for a bank with age, income, loan amount, credit score, and defaulted (Yes/No).") | |
| st.write("Create 500 rows of sales data with product category, region, sales amount, profit margin, and sales channel (Online/Offline).") | |
| st.write("Generate 200 rows of stock market data with date, opening price, closing price, highest price, lowest price, and trading volume.") | |
| st.subheader("๐งโ๐ Education") | |
| st.write("Create 700 student records with study hours, attendance, and final grade (A, B, C, D, F).") | |
| st.write("Generate 300 rows of teacher performance data with years of experience, subject taught, average student score, and teacher rating (1-5).") | |
| st.write("Generate 1000 rows of university admission data with applicant age, GPA, SAT score, extracurricular activities, and admission status (Accepted/Rejected).") | |
| st.subheader("๐ Environment") | |
| st.write("Generate 365 days of air quality data with PM2.5, PM10, CO2, and air quality (Good, Moderate, Hazardous).") | |
| st.write("Create 500 rows of weather data with date, temperature, humidity, wind speed, and precipitation level.") | |
| st.write("Generate 1000 rows of energy consumption data with household size, monthly usage (kWh), energy source (Solar, Wind, Grid), and cost.") | |
| st.subheader("๐ฅ Healthcare") | |
| st.write("Generate 1000 patient records with age, gender, blood pressure, cholesterol level, and diagnosis (Healthy, At Risk, Critical).") | |
| st.write("Create 500 rows of hospital data with department, number of patients, average treatment cost, and satisfaction rating (1-5).") | |
| st.write("Generate 300 rows of clinical trial data with participant ID, age, treatment type, side effects (Yes/No), and outcome (Improved/Unchanged/Worsened).") | |
| st.subheader("๐ Transportation") | |
| st.write("Generate 1000 rows of vehicle data with make, model, year, fuel efficiency (mpg), and price.") | |
| st.write("Create 500 rows of traffic data with date, time, location, number of vehicles, and average speed.") | |
| st.write("Generate 300 rows of ride-sharing data with driver ID, trip distance, trip duration, fare amount, and rating (1-5).") | |
| st.subheader("๐ Retail & E-commerce") | |
| st.write("Generate 1000 rows of customer purchase data with customer ID, product category, purchase amount, and payment method (Credit Card, PayPal, Cash).") | |
| st.write("Create 500 rows of inventory data with product ID, category, stock level, reorder point, and supplier.") | |
| st.write("Generate 300 rows of website analytics data with date, page views, unique visitors, bounce rate, and conversion rate.") | |
| st.subheader("๐๏ธ Construction & Real Estate") | |
| st.write("Generate 500 rows of real estate data with property type, location, size (sq ft), price, and status (Available/Sold).") | |
| st.write("Create 300 rows of construction project data with project ID, start date, end date, budget, and completion status (On Track/Delayed).") | |
| st.write("Generate 200 rows of rental data with property type, monthly rent, tenant age, and lease duration (months).") | |
| st.subheader("๐ฎ Gaming & Entertainment") | |
| st.write("Generate 1000 rows of gaming data with player ID, game title, hours played, in-game purchases, and player rank.") | |
| st.write("Create 500 rows of movie data with title, genre, release year, box office revenue, and IMDb rating.") | |
| st.write("Generate 300 rows of music streaming data with user ID, song title, artist, play count, and duration (minutes).") | |
| with tab3: | |
| st.header("Chat with Dataset") | |
| uploaded_file = st.file_uploader("Upload CSV for Chatting", type="csv") | |
| if uploaded_file: | |
| try: | |
| df = pd.read_csv(uploaded_file) | |
| st.success("File uploaded successfully!") | |
| chat_with_dataset(df) | |
| except Exception as e: | |
| st.error(f"Error loading CSV file: {str(e)}") | |
| else: | |
| st.info("Upload a CSV file to start chatting.") | |
| # NLP Pipeline Tab | |
| with tab4: | |
| nlp_pipeline_tab() | |
| # Footer | |
| add_footer() | |
| # Sidebar for data processing and visualization | |
| add_sidebar() | |
| feature_options = st.sidebar.radio("Select Option", ["Dataset Overview", "Clean Data", "Detect Outlier", "Encoder", | |
| "Data Transformer", "Data Analysis", "Feature Importance Analyzer", | |
| "Best Parameter Selector", "Train The Dataset", "Clear Modified Dataset", | |
| "Visualizations"]) | |
| if 'uploaded_df' in st.session_state: | |
| df = st.session_state['uploaded_df'] | |
| try: | |
| if feature_options == "Dataset Overview": | |
| dataset_overview(df) | |
| elif feature_options == "Clean Data": | |
| st.session_state['uploaded_df'] = clean_data(df) | |
| elif feature_options == "Detect Outlier": | |
| detect_outlier(df) | |
| elif feature_options == "Encoder": | |
| st.session_state['uploaded_df'] = encoder(df) | |
| elif feature_options == "Data Transformer": | |
| st.session_state['uploaded_df'] = data_transformer(df) | |
| elif feature_options == "Data Analysis": | |
| data_analysis(df) | |
| elif feature_options == "Feature Importance Analyzer": | |
| feature_importance_analyzer(df) | |
| elif feature_options == "Best Parameter Selector": | |
| best_parameter_selector(df) | |
| elif feature_options == "Train The Dataset": | |
| select_ml_models(df) | |
| elif feature_options == "Clear Modified Dataset": | |
| clear_modified_dataset() | |
| elif feature_options == "Visualizations": | |
| visualize_dataset(df) | |
| features = st.sidebar.multiselect("Select features for specific visualizations", df.columns.tolist()) | |
| if features: | |
| visualize_specific_features(df, features) | |
| # "NLP Pipeline" runs in its own tab above and is not a sidebar option. | |
| except Exception as e: | |
| st.error(f"Error processing dataset: {str(e)}") | |
| else: | |
| st.sidebar.info("Upload a CSV to proceed.") |