akash committed on
Commit
890025a
·
1 Parent(s): 65eae8a
app.py ADDED
@@ -0,0 +1,168 @@
+ import functools
+ import os
+ import sys
+ import time
+
+ import pandas as pd
+ import streamlit as st
+
+ # Streamlit page setup
+ st.set_page_config(
+     page_title="AutoML",
+     page_icon="🛸",
+     layout="wide",
+     initial_sidebar_state="expanded",
+     menu_items={"Get Help": None, "Report a bug": None, "About": None},
+ )
+
+ # Add the project root and src to the Python path
+ sys.path.extend([
+     os.path.dirname(os.path.abspath(__file__)),  # project root
+     os.path.join(os.path.dirname(os.path.abspath(__file__)), "src"),
+ ])
+
+ # Import the loading screen and CSS loader before any other components
+ from src.ui.loading import show_loading_state
+ from src.ui.css import load_css
+
+ # Load CSS immediately after the imports
+ load_css()
+
+ # Cached resource loading with a TTL so components refresh periodically
+ @st.cache_resource(ttl=3600)  # cache for 1 hour
+ def load_components():
+     """Cache component imports to avoid reloading them on every rerun."""
+     from src import (
+         show_footer,
+         visualize_data,
+         show_welcome_page,
+         show_overview_page,
+         clean_csv,
+         model_training_tab,
+         display_ai_insights,
+         display_model_evaluation,
+     )
+     return (show_footer, visualize_data,
+             show_welcome_page, show_overview_page, clean_csv,
+             model_training_tab, display_ai_insights, display_model_evaluation)
+
+ # Cached header rendering
+ @st.cache_data(ttl=86400)  # cache for 24 hours
+ def render_header():
+     """Cache the static header HTML."""
+     return """
+     <div class='app-header' style='padding: 1rem 0; margin-bottom: 2rem; text-align: center;'>
+         <h1 class='app-title' style='margin: 0;'>AutoML</h1>
+         <p class='app-tagline' style='margin-top: 0;'>Automated Machine Learning Made Simple.</p>
+     </div>
+     """
+
+ # Cached data loading
+ @st.cache_data(ttl=3600)  # cache for 1 hour
+ def load_default_data():
+     """Load and cache the default dataset."""
+     try:
+         return pd.read_csv("laptop_data.csv")
+     except Exception as e:
+         st.error(f"❌ Error loading default dataset: {e}")
+         return None
+
+ # Performance monitoring decorator
+ def measure_time(func):
+     """Decorator that measures a function's execution time."""
+     @functools.wraps(func)
+     def wrapper(*args, **kwargs):
+         start_time = time.time()
+         result = func(*args, **kwargs)
+         execution_time = time.time() - start_time
+         if execution_time > 1.0:  # only log slow operations
+             print(f"⏱️ {func.__name__} took {execution_time:.2f} seconds to execute")
+         return result
+     return wrapper
+
+ @measure_time
+ def main():
+     """Main entry point for the Streamlit AutoML app."""
+     # Show the loading screen before anything else on the first run
+     if "initialized" not in st.session_state:
+         # Show the loading animation in full-screen mode
+         with st.container():
+             show_loading_state()
+
+         # Force the loading screen to render full-bleed by collapsing the section padding
+         st.empty().markdown("<style>#root > div:nth-child(1) > div > div > div > div > section > div {padding: 0rem;}</style>", unsafe_allow_html=True)
+
+         # Now load the components in the background
+         components = load_components()
+         (show_footer, visualize_data,
+          show_welcome_page, show_overview_page, clean_csv,
+          model_training_tab, display_ai_insights, display_model_evaluation) = components
+
+         try:
+             # Load and clean the data, with caching
+             default_df = load_default_data()
+             if default_df is not None:
+                 cleaned_df, insights = clean_csv(default_df)
+
+                 # Store everything in session state
+                 st.session_state.update({
+                     "df": cleaned_df,
+                     "insights": insights,
+                     "components": components,
+                     "initialized": True,
+                     "current_tab_index": 0,  # consistent naming for tab tracking
+                 })
+
+                 # Rerun to hide the loading screen
+                 st.rerun()
+             else:
+                 st.error("❌ Failed to load default dataset")
+                 return
+
+         except Exception as e:
+             st.error(f"❌ Error during initialization: {e}")
+             return
+
+     # After initialization, show the main interface
+     if "initialized" in st.session_state:
+         components = st.session_state.components
+         (show_footer, visualize_data,
+          show_welcome_page, show_overview_page, clean_csv,
+          model_training_tab, display_ai_insights, display_model_evaluation) = components
+
+         # Render the main interface
+         st.markdown(render_header(), unsafe_allow_html=True)
+
+         # Keep the tab names in a constant so they are not rebuilt on every rerun
+         TAB_NAMES = ["👋 Welcome", "📊 Overview", "📈 Visualization",
+                      "🤖 Model Training", "💡 Insights", "📊 Test Results"]
+
+         # Initialize the current tab index if it is not present
+         if "current_tab_index" not in st.session_state:
+             st.session_state.current_tab_index = 0
+
+         # st.tabs returns a list of tab containers, one per name
+         tabs = st.tabs(TAB_NAMES)
+
+         # Display content in all tabs
+         with tabs[0]:
+             show_welcome_page()
+
+         with tabs[1]:
+             show_overview_page()
+
+         with tabs[2]:
+             visualize_data(st.session_state.df)
+
+         with tabs[3]:
+             model_training_tab(st.session_state.df)
+
+         with tabs[4]:
+             display_ai_insights()
+
+         with tabs[5]:
+             display_model_evaluation()
+
+         show_footer()
+
+ if __name__ == "__main__":
+     main()
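A side note on the two cache decorators used above: `st.cache_resource` keeps one shared, non-serialized object per process (suited to the imported component functions), while `st.cache_data` memoizes serializable return values keyed on the arguments and hands back a fresh copy each call. A minimal sketch of the distinction, independent of this app:

import streamlit as st
import pandas as pd

@st.cache_resource(ttl=3600)
def get_shared_object():
    # one instance shared across reruns and sessions; never copied or pickled
    return {"loaded_at": pd.Timestamp.now()}

@st.cache_data(ttl=3600)
def summarize(csv_path: str) -> pd.DataFrame:
    # result is keyed on csv_path and returned as a fresh copy on every call
    return pd.read_csv(csv_path).describe()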
example.env ADDED
@@ -0,0 +1,13 @@
+ # AutoML Environment Variables
+
+ # API keys for LLM services
+ GROQ_API_KEY=your_groq_api_key_here
+ GEMINI_API_KEY=your_gemini_api_key_here
+
+ # LangSmith tracking (optional)
+ LANGCHAIN_TRACING_V2=true
+ LANGCHAIN_API_KEY=your_langchain_api_key_here
+ LANGCHAIN_PROJECT=automl-project
+
+ # Optional: logging configuration
+ LOG_LEVEL=INFO
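For these values to reach `load_dotenv()` in src/preprocessing/clean_data.py, the template presumably gets copied to `.env` with real keys (python-dotenv reads a file named `.env` from the working directory by default). A minimal sanity check, assuming that copy has been made:

import os
from dotenv import load_dotenv

load_dotenv()  # python-dotenv picks up .env by default
for key in ("GROQ_API_KEY", "GEMINI_API_KEY"):
    print(key, "set" if os.getenv(key) else "MISSING")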
header.svg ADDED
laptop_data.csv ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,18 @@
+ streamlit>=1.29.0
+ pandas>=2.0.0
+ numpy>=1.24.0
+ scikit-learn>=1.2.0
+ matplotlib>=3.7.0
+ plotly>=5.14.0
+ seaborn>=0.12.0
+ langchain>=0.0.267
+ langchain-groq>=0.0.1
+ langchain-google-genai>=0.0.3
+ python-dotenv>=1.0.0
+ scipy>=1.10.0
+ joblib>=1.2.0
+ pydantic>=2.0.0
+ requests>=2.28.0
+ pillow>=9.0.0
+ altair>=4.2.0
+ beautifulsoup4>=4.11.0
+ # imported by src/training/train.py (XGBRegressor / XGBClassifier) but missing above
+ xgboost
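The usual setup, assuming a fresh virtual environment at the repo root, is to install these pins and launch the Streamlit entry point:

pip install -r requirements.txt
streamlit run app.py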
src/__init__.py ADDED
@@ -0,0 +1,6 @@
+
+ from .ui import *
+ from .training import *
+ from .preprocessing import *
+ from .utils import *
+
src/preprocessing/__init__.py ADDED
@@ -0,0 +1,4 @@
+ from .clean_data import clean_csv
+ from .clean_df_fallback import clean_dataframe_fallback
+
+ __all__ = ["clean_csv", "clean_dataframe_fallback"]
src/preprocessing/clean_data.py ADDED
@@ -0,0 +1,268 @@
+ from dotenv import load_dotenv
+ from langchain_groq import ChatGroq
+ from langchain_google_genai import ChatGoogleGenerativeAI
+ from langchain.prompts import PromptTemplate
+ import pandas as pd
+ import re
+ import os
+ import streamlit as st
+ from .clean_df_fallback import clean_dataframe_fallback
+
+
+ # Load environment variables
+ load_dotenv()
+
+ groq_api_key = os.getenv("GROQ_API_KEY")
+ gemini_api_key = os.getenv("GEMINI_API_KEY")
+
+ if not gemini_api_key:
+     raise ValueError("GEMINI_API_KEY not found in environment variables")
+ if not groq_api_key:
+     raise ValueError("GROQ_API_KEY not found in environment variables")
+
+
+ # Initialize the LLM, falling back from Gemini to Groq
+ try:
+     llm = ChatGoogleGenerativeAI(
+         model="gemini-2.0-flash-lite-preview-02-05",
+         google_api_key=gemini_api_key
+     )
+     print("Primary Gemini LLM loaded successfully.")
+
+ except Exception as e:
+     print(f"Error initializing primary Gemini LLM: {e}")
+
+     # Fall back to a Groq-hosted model
+     try:
+         llm = ChatGroq(
+             model="gemma2-9b-it",  # replace with your desired Groq model identifier
+             groq_api_key=groq_api_key
+         )
+         print("Fallback Groq LLM loaded successfully.")
+
+     except Exception as e2:
+         print(f"Error initializing fallback Groq LLM: {e2}")
+         llm = None
+
+
+ # Cache clean_csv to prevent redundant cleaning
+ @st.cache_data(ttl=3600, show_spinner=False)
+ def cached_clean_csv(df_json, skip_cleaning=False):
+     """Cached wrapper around clean_csv that prevents redundant cleaning.
+
+     Args:
+         df_json: JSON string representation of the dataframe (used for hashing)
+         skip_cleaning: Whether to skip cleaning
+
+     Returns:
+         Tuple of (cleaned_df, insights)
+     """
+     # Convert the JSON back to a dataframe
+     df = pd.read_json(df_json, orient='records')
+
+     # If skip_cleaning is True, return the dataframe as-is
+     if skip_cleaning:
+         return df, "No cleaning performed (user skipped)."
+
+     # Reset any test results when cleaning a new dataset
+     if "test_results_calculated" in st.session_state:
+         st.session_state.test_results_calculated = False
+         # Clear any previous test metrics to avoid using stale data
+         for key in ['test_metrics', 'test_y_pred', 'test_y_test', 'test_cm', 'sampling_message']:
+             if key in st.session_state:
+                 del st.session_state[key]
+
+     # Call the actual cleaning function
+     return clean_csv(df)
+
+
+ def clean_csv(df):
+     """Perform the actual cleaning, generating the cleaning code with an LLM."""
+     # ---------------------------
+     # Early fallback if LLM initialization failed
+     # ---------------------------
+     if llm is None:
+         print("LLM initialization failed; using hardcoded cleaning function.")
+         fallback_df = clean_dataframe_fallback(df)
+         return fallback_df, "LLM initialization failed; the hardcoded cleaning function was used, so no insights were generated."
+
+     # ---------------------------
+     # LLM-based cleaning function generation
+     # ---------------------------
+
+     # Escape curly braces in the JSON sample and column names
+     sample_data = df.head(3).to_json(orient='records')
+     escaped_sample_data = sample_data.replace("{", "{{").replace("}", "}}")
+
+     escaped_columns = [
+         col.replace("{", "{{").replace("}", "}}") for col in df.columns
+     ]
+     column_names_str = ", ".join(escaped_columns)
+
+     # Define the prompt for generating the cleaning function
+     initial_prompt = PromptTemplate.from_template(f'''
+     You are given the following sample data from a pandas DataFrame:
+     {escaped_sample_data}
+
+     The column names are: [{column_names_str}].
+
+     Generate a Python function named clean_dataframe(df) that:
+
+     1. Performs thorough data cleaning without performing feature engineering. Ensure all necessary cleaning steps are included.
+     2. Uses assignment operations (e.g., df = df.drop(...)) and avoids inplace=True for clarity.
+     3. First deeply analyzes each column's content (this is the most important step) to infer its predominant data type. For example, if rows contain "Rs.2100" remove "Rs.", and if they contain "(89%)" remove "%". A column with only text and no numbers is a text column, one with both numbers and text is a mixed column, and one with only numbers is a numeric column.
+     4. For columns that are intended to be numeric but contain extra characters (such as '%' in percentage values, currency symbols like 'Rs.' or '$', and commas), removes all non-digit characters except the decimal point and converts them to a numeric type.
+     5. For columns that are clearly text or categorical, preserves the content without removing digits or altering the textual information.
+     6. Handles missing values appropriately: fills numeric columns with the median (or 0 if the median is not available) and non-numeric columns with 'Unknown'.
+     7. For columns where more than 50% of values are strings and less than 10% are numeric, performs conservative string cleaning by removing unwanted special symbols while preserving meaningful digits.
+     8. For columns whose names contain 'name' (case-insensitive), converts to string type and removes extraneous numeric characters only if they are not part of the essential text.
+     9. Preserves other categorical or text columns (such as Gender, City, State, Country, etc.) unless explicitly specified for removal.
+     10. Handles edge cases such as completely empty columns appropriately.
+
+     Return only the Python code for the function, with no explanations or extra formatting.
+     ''')
+
+     # Define the refinement prompt
+     refine_prompt = PromptTemplate.from_template(
+         "The following Python code for cleaning a DataFrame caused an error: {error}\n"
+         "Original code:\n{code}\n"
+         "Please correct the code to fix the error and ensure it returns a cleaned DataFrame. "
+         "Return only the corrected Python code for the function, no explanations or formatting."
+     )
+
+     # Create the chains using the modern LangChain composition style
+     initial_chain = initial_prompt | llm
+     refine_chain = refine_prompt | llm
+
+     def extract_code(response):
+         """Pull the Python code out of a raw string or an LLM response object."""
+         if isinstance(response, str):
+             # Handle Markdown or plain text
+             if "```python" in response:
+                 match = re.search(r'```python\n(.*?)\n```', response, re.DOTALL)
+                 return match.group(1).strip() if match else response
+             elif "```" in response:
+                 match = re.search(r'```\n(.*?)\n```', response, re.DOTALL)
+                 return match.group(1).strip() if match else response
+             return response.strip()
+
+         # Handle LLM response objects
+         content = getattr(response, 'content', str(response))
+
+         if "```python" in content:
+             match = re.search(r'```python\n(.*?)\n```', content, re.DOTALL)
+             return match.group(1).strip() if match else content
+         elif "```" in content:
+             match = re.search(r'```\n(.*?)\n```', content, re.DOTALL)
+             return match.group(1).strip() if match else content
+
+         return content.strip()
+
+     try:
+         # Generate the initial code and extract it from the response
+         cleaning_function_code = extract_code(initial_chain.invoke({}))
+         print("Initial generated cleaning function code (not executed yet):\n", cleaning_function_code)
+
+         # Iterative refinement loop with at most 5 attempts
+         max_attempts = 5
+
+         for attempt in range(max_attempts):
+             print(f"Attempt {attempt + 1} code:\n{cleaning_function_code}")
+             try:
+                 # Execute the generated code in the global namespace
+                 exec(cleaning_function_code, globals())
+
+                 if 'clean_dataframe' not in globals():
+                     raise NameError("Cleaning function not defined in generated code")
+
+                 # Call the generated function and assign the result back to df
+                 df = clean_dataframe(df)
+
+                 print(f"Cleaning successful on attempt {attempt + 1}")
+                 break
+
+             # If the cleaning fails, refine the code with the error message
+             except Exception as e:
+                 error_message = str(e)
+                 print(f"Error on attempt {attempt + 1}: {error_message}")
+
+                 if attempt < max_attempts - 1:
+                     # Refine the code using the error message while attempts remain
+                     refined_response = refine_chain.invoke({"error": error_message, "code": cleaning_function_code})
+                     cleaning_function_code = extract_code(refined_response)
+                     print("Refined cleaning function code:\n", cleaning_function_code)
+                 else:
+                     print("Failed to clean DataFrame after 5 attempts")
+                     # After all attempts failed, use the hardcoded logic
+                     df = clean_dataframe_fallback(df)
+
+     except Exception as e:
+         print(f"⚡ No successful cleaning was done; enforcing the fallback ({e})")
+         df = clean_dataframe_fallback(df)
+
+     cleaned_df = df
+
+     insights_prompt = f"""
+     Analyze this cleaned dataset:
+     - Columns: {cleaned_df.columns.tolist()}
+     - Sample data: {cleaned_df.head(3).to_dict()}
+     - Numeric stats: {cleaned_df.describe().to_dict()}
+     Provide key data quality insights and recommendations.
+     """
+
+     try:
+         insights_response = llm.invoke(insights_prompt)
+         analysis_insights = insights_response.content
+     except Exception as e:
+         analysis_insights = f"Insight generation failed: {e}"
+
+     # Return the cleaned DataFrame and the generated insights
+     return cleaned_df, analysis_insights
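Note that `cached_clean_csv` hashes a JSON string rather than the DataFrame itself, so a call site has to serialize first. A hypothetical call (the path is illustrative; outside a Streamlit runtime the cache decorator simply logs a warning and runs uncached):

import pandas as pd
from src.preprocessing.clean_data import cached_clean_csv

df = pd.read_csv("laptop_data.csv")
cleaned_df, insights = cached_clean_csv(df.to_json(orient="records"))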
src/preprocessing/clean_df_fallback.py ADDED
@@ -0,0 +1,143 @@
+ import re
+ import pandas as pd
+ import numpy as np
+ import streamlit as st
+
+
+ # Fallback cleaning function used when LLM-generated cleaning fails
+ @st.cache_data
+ def clean_dataframe_fallback(df):
+     """Hardcoded, generic data-cleaning pipeline with categorical preservation."""
+     df_cleaned = df.copy()
+
+     # Remove parenthesised annotations such as "(89%)"
+     df_cleaned = df_cleaned.applymap(
+         lambda x: re.sub(r"\(.*?\)", "", str(x)) if isinstance(x, str) else x)
+
+     # Remove 'ref.' references
+     df_cleaned = df_cleaned.applymap(
+         lambda x: re.sub(r"ref\.", "", str(x), flags=re.IGNORECASE) if isinstance(x, str) else x)
+
+     # Remove any other special characters except letters, digits, spaces, and dots
+     df_cleaned = df_cleaned.applymap(
+         lambda x: re.sub(r"[^\w\s\d\.]", "", str(x)).strip() if isinstance(x, str) else x
+     )
+
+     # Step 0 - Clean the column names first
+     df_cleaned.columns = [col.strip().lower().replace(' ', '_') for col in df_cleaned.columns]
+
+     # Define measurement units to remove
+     measurement_units = {
+         'weight': r'\s*(kg|kilograms|lbs|pounds)$',
+         'height': r'\s*(cm|centimeters|inches|feet|ft)$'
+     }
+
+     # Step 1 - Remove redundant columns
+     # Preservation patterns for categorical columns
+     preserve_pattern = re.compile(r'(name|brand|model|type|category|region|text|desc|color|size)', re.IGNORECASE)
+     preserved_cols = [col for col in df_cleaned.columns if preserve_pattern.search(col)]
+
+     # ID pattern detection
+     id_pattern = re.compile(r'(_id|id_|num|no|number|identifier|code|idx|row)', re.IGNORECASE)
+     id_cols = [col for col in df_cleaned.columns if id_pattern.search(col) and col not in preserved_cols]
+
+     # Columns where every value is unique
+     unique_cols = [col for col in df_cleaned.columns
+                    if df_cleaned[col].nunique() == len(df_cleaned)
+                    and col not in preserved_cols]
+
+     redundant_cols = list(set(id_cols + unique_cols))
+     df_cleaned = df_cleaned.drop(columns=redundant_cols)
+     print(f"Removed {len(redundant_cols)} redundant columns: {redundant_cols}")
+
+     # Step 2 - Numeric detection with categorical protection
+     for col in df_cleaned.columns:
+         if col in preserved_cols:
+             print(f"Preserving categorical column: {col}")
+             continue  # skip preserved columns
+
+         if any(unit in col for unit in measurement_units.keys()):
+             pattern = measurement_units.get(col.split('_')[0], r'')
+             df_cleaned[col] = df_cleaned[col].astype(str).str.replace(pattern, '', regex=True).str.strip()
+
+         if pd.api.types.is_numeric_dtype(df_cleaned[col]):
+             continue
+
+         # Strict numeric pattern detection on a sample of the column
+         non_null_count = df_cleaned[col].dropna().shape[0]
+         sample_size = min(100, non_null_count)
+         sample = df_cleaned[col].dropna().sample(sample_size, random_state=42)
+         numeric_pattern = r'^[-+]?\d*\.?\d+$'  # full-string match
+         num_matches = sample.astype(str).str.fullmatch(numeric_pattern).mean()
+
+         if num_matches > 0.8:  # high threshold
+             # Conservative cleaning
+             cleaned = df_cleaned[col].replace(r'[^\d\.\-]', '', regex=True)
+             converted = pd.to_numeric(cleaned, errors='coerce')
+             success_rate = converted.notna().mean()
+
+             if success_rate > 0.9:  # strict success requirement
+                 df_cleaned[col] = converted
+                 print(f"Converted {col} to numeric (success: {success_rate:.1%})")
+
+     # Step 3 - Date detection
+     date_cols = []
+     for col in df_cleaned.select_dtypes(exclude=np.number).columns:
+         if col in preserved_cols:
+             continue
+         try:
+             df_cleaned[col] = pd.to_datetime(df_cleaned[col], errors='raise')
+             date_cols.append(col)
+             print(f"Detected datetime: {col}")
+         except Exception:
+             pass
+
+     # Manual currency handling for price-like columns
+     currency_cols = [col for col in df_cleaned.columns if any(keyword in col.lower() for keyword in ["price", "gross", "budget"])]
+     for col in currency_cols:
+         df_cleaned[col] = df_cleaned[col].astype(str).str.replace(r'[^\d\.]', '', regex=True)  # keep only digits and dots
+         df_cleaned[col] = pd.to_numeric(df_cleaned[col], errors='coerce')
+
+     # Step 4 - Missing value handling
+     numeric_cols = df_cleaned.select_dtypes(include=np.number).columns
+     categorical_cols = df_cleaned.select_dtypes(exclude=np.number).columns
+
+     # Numeric imputation with the median, plus a missingness indicator column
+     for col in numeric_cols:
+         if df_cleaned[col].isna().any():
+             df_cleaned[f'{col}_missing'] = df_cleaned[col].isna().astype(int)
+             df_cleaned[col] = df_cleaned[col].fillna(df_cleaned[col].median())
+
+     # Categorical imputation with the mode
+     for col in categorical_cols:
+         if df_cleaned[col].isna().any():
+             mode_val = df_cleaned[col].mode()[0] if not df_cleaned[col].mode().empty else 'Unknown'
+             df_cleaned[col] = df_cleaned[col].fillna(mode_val)
+
+     # Step 5 - Text normalization for non-preserved columns
+     text_cols = [col for col in categorical_cols if col not in preserved_cols]
+     for col in text_cols:
+         df_cleaned[col] = df_cleaned[col].astype(str).apply(lambda x: re.sub(r'\s+', ' ', re.sub(r'[^\w\s]', '', x)).strip().lower())
+
+     # Step 6 - Outlier clipping to the 5th/95th percentiles (categoricals preserved)
+     numeric_cols = df_cleaned.select_dtypes(include=np.number).columns
+     for col in numeric_cols:
+         if df_cleaned[col].nunique() > 10:
+             lower = df_cleaned[col].quantile(0.05)
+             upper = df_cleaned[col].quantile(0.95)
+             df_cleaned[col] = np.clip(df_cleaned[col], lower, upper)
+
+     # Step 7 - Final validation
+     df_cleaned = df_cleaned.drop_duplicates().reset_index(drop=True)
+
+     return df_cleaned
+
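To illustrate the Step 2 heuristic above: a column is only converted when more than 80% of the sampled values fully match the numeric pattern, which keeps mixed text/number columns textual. A small worked example:

import pandas as pd

s = pd.Series(["1,299", "Rs.2100", "899", None])
ratio = s.dropna().astype(str).str.fullmatch(r"^[-+]?\d*\.?\d+$").mean()
print(ratio)  # ≈ 0.33: only "899" matches, so this column would stay as text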
src/training/__init__.py ADDED
@@ -0,0 +1,7 @@
+ from .train import train_model
+ from .hyperparametrs import get_hyperparams_ui
+ from .model_training import model_training_tab
+ from .test_result import display_model_evaluation
+
+ __all__ = ["train_model", "get_hyperparams_ui", "model_training_tab", "display_model_evaluation"]
+
src/training/hyperparametrs.py ADDED
@@ -0,0 +1,107 @@
+ import streamlit as st
+
+
+ # Define the hyperparameter options per model
+ def get_hyperparams_ui(model_name):
+     """Generate UI components for model-specific hyperparameters."""
+     hyperparams = {}
+
+     if model_name in ["Random Forest Regressor", "Random Forest"]:
+         hyperparams["n_estimators"] = st.number_input("Number of Trees (n_estimators)", min_value=10, max_value=500, value=100)
+         hyperparams["max_depth"] = st.number_input("Max Depth", min_value=1, max_value=50, value=10)
+         hyperparams["min_samples_split"] = st.number_input("Min Samples Split", min_value=2, max_value=10, value=2)
+
+     elif model_name in ["XGBoost Regressor", "XGBoost"]:
+         hyperparams["n_estimators"] = st.number_input("Number of Boosting Rounds (n_estimators)", min_value=10, max_value=500, value=100)
+         hyperparams["learning_rate"] = st.slider("Learning Rate", 0.01, 1.0, 0.1)
+         hyperparams["max_depth"] = st.number_input("Max Depth", min_value=1, max_value=50, value=6)
+
+     elif model_name == "Linear Regression":
+         st.info("No hyperparameters required for Linear Regression.")
+
+     # Additional regression models:
+     elif model_name == "Polynomial Regression":
+         hyperparams["degree"] = st.number_input("Degree of Polynomial Features", min_value=2, max_value=10, value=2)
+         # Hyperparameters for the underlying LinearRegression could be added here if needed
+
+     elif model_name == "Ridge Regression":
+         hyperparams["alpha"] = st.slider("Regularization Strength (alpha)", 0.01, 10.0, 1.0)
+         hyperparams["solver"] = st.selectbox("Solver", ["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga", "lbfgs"])
+
+     elif model_name == "Lasso Regression":
+         hyperparams["alpha"] = st.slider("Regularization Strength (alpha)", 0.01, 10.0, 1.0)
+         hyperparams["max_iter"] = st.number_input("Max Iterations", min_value=100, max_value=1000, value=1000)
+
+     elif model_name == "Logistic Regression":
+         hyperparams["C"] = st.slider("Regularization Strength (C)", 0.01, 10.0, 1.0)
+         hyperparams["max_iter"] = st.number_input("Max Iterations", min_value=100, max_value=1000, value=200)
+
+     elif model_name == "Support Vector Regressor":
+         hyperparams["C"] = st.slider("Regularization parameter (C)", 0.1, 100.0, 1.0)
+         hyperparams["epsilon"] = st.slider("Epsilon", 0.0, 1.0, 0.1)
+         hyperparams["kernel"] = st.selectbox("Kernel", ["linear", "rbf", "poly", "sigmoid"])
+
+     elif model_name == "Decision Tree Regressor":
+         hyperparams["max_depth"] = st.number_input("Max Depth", min_value=1, max_value=50, value=10)
+         hyperparams["min_samples_split"] = st.number_input("Min Samples Split", min_value=2, max_value=10, value=2)
+
+     elif model_name == "K-Nearest Neighbors Regressor":
+         hyperparams["n_neighbors"] = st.number_input("Number of Neighbors", min_value=1, max_value=100, value=5)
+         hyperparams["weights"] = st.selectbox("Weight Function", ["uniform", "distance"])
+
+     elif model_name == "ElasticNet":
+         hyperparams["alpha"] = st.slider("Alpha", 0.01, 10.0, 1.0)
+         hyperparams["l1_ratio"] = st.slider("L1 Ratio", 0.0, 1.0, 0.5)
+
+     elif model_name == "Gradient Boosting Regressor":
+         hyperparams["n_estimators"] = st.number_input("Number of Estimators", min_value=10, max_value=500, value=100)
+         hyperparams["learning_rate"] = st.slider("Learning Rate", 0.01, 1.0, 0.1)
+         hyperparams["max_depth"] = st.number_input("Max Depth", min_value=1, max_value=20, value=3)
+
+     elif model_name == "AdaBoost Regressor":
+         hyperparams["n_estimators"] = st.number_input("Number of Estimators", min_value=10, max_value=500, value=50)
+         hyperparams["learning_rate"] = st.slider("Learning Rate", 0.01, 1.0, 0.1)
+
+     elif model_name == "Bayesian Ridge":
+         hyperparams["alpha_1"] = st.slider("Alpha 1", 1e-6, 1e-1, 1e-4, format="%.6f")
+         hyperparams["alpha_2"] = st.slider("Alpha 2", 1e-6, 1e-1, 1e-4, format="%.6f")
+         hyperparams["lambda_1"] = st.slider("Lambda 1", 1e-6, 1e-1, 1e-4, format="%.6f")
+         hyperparams["lambda_2"] = st.slider("Lambda 2", 1e-6, 1e-1, 1e-4, format="%.6f")
+
+     # --- Additional classification models ---
+     elif model_name == "Support Vector Classifier":
+         hyperparams["C"] = st.slider("Regularization parameter (C)", 0.1, 100.0, 1.0)
+         hyperparams["kernel"] = st.selectbox("Kernel", ["linear", "rbf", "poly", "sigmoid"])
+
+     elif model_name == "Decision Tree Classifier":
+         hyperparams["max_depth"] = st.number_input("Max Depth", min_value=1, max_value=50, value=10)
+         hyperparams["min_samples_split"] = st.number_input("Min Samples Split", min_value=2, max_value=10, value=2)
+
+     elif model_name == "K-Nearest Neighbors Classifier":
+         hyperparams["n_neighbors"] = st.number_input("Number of Neighbors", min_value=1, max_value=100, value=5)
+         hyperparams["weights"] = st.selectbox("Weight Function", ["uniform", "distance"])
+
+     elif model_name == "Gradient Boosting Classifier":
+         hyperparams["n_estimators"] = st.number_input("Number of Estimators", min_value=10, max_value=500, value=100)
+         hyperparams["learning_rate"] = st.slider("Learning Rate", 0.01, 1.0, 0.1)
+         hyperparams["max_depth"] = st.number_input("Max Depth", min_value=1, max_value=20, value=3)
+
+     elif model_name == "AdaBoost Classifier":
+         hyperparams["n_estimators"] = st.number_input("Number of Estimators", min_value=10, max_value=500, value=50)
+         hyperparams["learning_rate"] = st.slider("Learning Rate", 0.01, 1.0, 0.1)
+
+     elif model_name == "Gaussian Naive Bayes":
+         hyperparams["var_smoothing"] = st.slider("Var Smoothing", 1e-12, 1e-8, 1e-9, format="%.12f")
+
+     elif model_name == "Quadratic Discriminant Analysis":
+         hyperparams["reg_param"] = st.slider("Regularization Parameter", 0.0, 1.0, 0.0)
+
+     elif model_name == "Linear Discriminant Analysis":
+         hyperparams["solver"] = st.selectbox("Solver", ["svd", "lsqr", "eigen"])
+
+     return hyperparams
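The dict built here is ultimately splatted into the estimator constructor in src/training/train.py (`models[task_type][model_name](**hyperparams)`). The equivalent in plain Python, outside the UI:

from sklearn.ensemble import RandomForestRegressor

hyperparams = {"n_estimators": 100, "max_depth": 10, "min_samples_split": 2}
model = RandomForestRegressor(**hyperparams)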
src/training/model_training.py ADDED
@@ -0,0 +1,93 @@
+ import os
+ import pickle
+ import tempfile
+
+ import streamlit as st
+
+ from .hyperparametrs import get_hyperparams_ui
+ from .train import train_model
+
+
+ # Model Training tab
+ def model_training_tab(df):
+     # Ensure session state is set up for model training
+     if "target_column" not in st.session_state:
+         st.session_state.target_column = df.columns[0] if not df.empty else None
+
+     if "selected_model" not in st.session_state:
+         st.session_state.selected_model = None
+
+     st.subheader("📌 Model Training")
+
+     # Use session state to maintain the selection across reruns
+     target_column = st.selectbox(
+         "🎯 Select Target Column (Y)",
+         df.columns,
+         index=list(df.columns).index(st.session_state.target_column) if st.session_state.target_column in df.columns else 0,
+         key="target_column_select"
+     )
+
+     # Update session state after selection
+     st.session_state.target_column = target_column
+
+     # Infer the task type automatically
+     task_type = "classification" if df[target_column].dtype == "object" or df[target_column].nunique() <= 10 else "regression"
+     st.write(f"🔍 Detected Task Type: **{task_type.capitalize()}**")
+
+     model_options = {
+         "classification": ["Random Forest", "Logistic Regression", "XGBoost", "Support Vector Classifier", "Decision Tree Classifier", "K-Nearest Neighbors Classifier", "Gradient Boosting Classifier", "AdaBoost Classifier", "Gaussian Naive Bayes", "Quadratic Discriminant Analysis", "Linear Discriminant Analysis"],
+         "regression": ["Linear Regression", "Random Forest Regressor", "XGBoost Regressor", "Support Vector Regressor", "Decision Tree Regressor", "K-Nearest Neighbors Regressor", "ElasticNet", "Gradient Boosting Regressor", "AdaBoost Regressor", "Bayesian Ridge", "Ridge Regression", "Lasso Regression"],
+     }
+
+     # Initialize the selected model if unset or if the task type changed
+     if st.session_state.selected_model not in model_options[task_type]:
+         st.session_state.selected_model = model_options[task_type][0]
+
+     # Use session state to maintain the selection across reruns
+     selected_model_name = st.selectbox(
+         "🤖 Choose Model",
+         model_options[task_type],
+         index=model_options[task_type].index(st.session_state.selected_model),
+         key="selected_model_select"
+     )
+
+     # Update session state after selection
+     st.session_state.selected_model = selected_model_name
+
+     st.markdown("### 🔧 Hyperparameters")
+     hyperparams = get_hyperparams_ui(selected_model_name)
+
+     # Use a unique key for the button to avoid conflicts
+     if st.button("🚀 Train Model", key="train_model_button_unique"):
+         with st.spinner("Training in progress... ⏳"):
+             trained_model = train_model(df, target_column, task_type, selected_model_name, hyperparams)
+             st.success("✅ Model trained successfully!")
+             st.session_state.trained_model = trained_model
+             st.session_state.model_trained = True
+
+             # Note: test_results_calculated is already reset inside train_model
+
+     if "trained_model" in st.session_state:
+         st.markdown("### 📥 Download Trained Model")
+
+         # Use a temporary file, with proper cleanup, for the pickled model
+         try:
+             with tempfile.NamedTemporaryFile(delete=False, suffix='.pkl') as temp_file:
+                 pickle.dump(st.session_state.trained_model, temp_file)
+                 temp_file_path = temp_file.name
+
+             # Read the file back for the download button
+             with open(temp_file_path, "rb") as f:
+                 st.download_button(
+                     label="📥 Download Model",
+                     data=f,
+                     file_name="trained_model.pkl",
+                     mime="application/octet-stream",
+                 )
+
+             # Clean up the temporary file
+             try:
+                 os.unlink(temp_file_path)
+             except OSError:
+                 pass  # silently ignore deletion errors
+
+         except Exception as e:
+             st.error(f"Error preparing model for download: {e}")
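The download is a plain pickle, so reusing it elsewhere is a `pickle.load` away. A sketch, keeping in mind that for classification the file holds a `(pipeline, label_encoder)` tuple, and that `X_new` below is a hypothetical DataFrame with the original feature columns:

import pickle

with open("trained_model.pkl", "rb") as f:
    obj = pickle.load(f)

pipeline = obj[0] if isinstance(obj, tuple) else obj
# predictions = pipeline.predict(X_new)  # X_new: DataFrame with the training feature columns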
src/training/test_result.py ADDED
@@ -0,0 +1,27 @@
+ import streamlit as st
+ from ui.test_results import display_test_results
+
+
+ def display_model_evaluation():
+     """Display the evaluation results of the trained model on the test set."""
+
+     st.header("📊 Model Evaluation on Test Set")
+
+     # Ensure the model and test data exist in session state
+     if "trained_model" in st.session_state and "X_test" in st.session_state:
+         trained_model = st.session_state.trained_model
+         X_test = st.session_state.X_test
+         y_test = st.session_state.y_test
+         task_type = st.session_state.task_type
+
+         # Handle the classification case, where the model may include a label encoder
+         if task_type == "classification" and isinstance(trained_model, tuple):
+             pipeline, label_encoder = trained_model
+             display_test_results((pipeline, label_encoder), X_test, y_test, task_type)
+         else:
+             display_test_results(trained_model, X_test, y_test, task_type)
+
+     else:
+         st.warning("🚨 Train a model first to see test results!")
src/training/train.py ADDED
@@ -0,0 +1,140 @@
+ from sklearn.compose import ColumnTransformer
+ from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
+ from sklearn.model_selection import train_test_split
+ from sklearn.linear_model import (LinearRegression, LogisticRegression,
+                                   ElasticNet, BayesianRidge, Ridge, Lasso)
+ from sklearn.ensemble import (RandomForestRegressor, RandomForestClassifier,
+                               GradientBoostingRegressor, AdaBoostRegressor,
+                               GradientBoostingClassifier, AdaBoostClassifier)
+ from xgboost import XGBRegressor, XGBClassifier
+ from sklearn.svm import SVR, SVC
+ from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
+ from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
+ from sklearn.naive_bayes import GaussianNB
+ from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
+ from sklearn.impute import SimpleImputer
+ from sklearn.pipeline import Pipeline as SkPipeline
+
+ import streamlit as st
+
+
+ def get_model(task_type, model_name, hyperparams):
+     """Return the model instance for the user's selection, with hyperparameters applied."""
+     models = {
+         "regression": {
+             # Originally available:
+             "Linear Regression": LinearRegression,
+             "Random Forest Regressor": RandomForestRegressor,
+             "XGBoost Regressor": XGBRegressor,
+             # Additional regression models:
+             "Support Vector Regressor": SVR,
+             "Decision Tree Regressor": DecisionTreeRegressor,
+             "K-Nearest Neighbors Regressor": KNeighborsRegressor,
+             "ElasticNet": ElasticNet,
+             "Gradient Boosting Regressor": GradientBoostingRegressor,
+             "AdaBoost Regressor": AdaBoostRegressor,
+             "Bayesian Ridge": BayesianRidge,
+             "Ridge Regression": Ridge,
+             "Lasso Regression": Lasso,
+         },
+         "classification": {
+             # Originally available:
+             "Logistic Regression": LogisticRegression,
+             "Random Forest": RandomForestClassifier,
+             "XGBoost": XGBClassifier,
+             # Additional classification models:
+             "Support Vector Classifier": SVC,
+             "Decision Tree Classifier": DecisionTreeClassifier,
+             "K-Nearest Neighbors Classifier": KNeighborsClassifier,
+             "Gradient Boosting Classifier": GradientBoostingClassifier,
+             "AdaBoost Classifier": AdaBoostClassifier,
+             "Gaussian Naive Bayes": GaussianNB,
+             "Quadratic Discriminant Analysis": QuadraticDiscriminantAnalysis,
+             "Linear Discriminant Analysis": LinearDiscriminantAnalysis
+         }
+     }
+
+     if task_type in models and model_name in models[task_type]:
+         return models[task_type][model_name](**hyperparams)  # apply hyperparameters
+     else:
+         raise ValueError(f"Invalid model selection: {model_name} for {task_type}")
+
+
+ def train_model(df, target_column, task_type, selected_model_name, hyperparams):
+     """Preprocess the data, train the selected model with its hyperparameters, and return it."""
+
+     with st.spinner("Training model... Please wait!"):
+
+         # Get the model with hyperparameters
+         model = get_model(task_type, selected_model_name, hyperparams)
+
+         # Split features and target
+         X = df.drop(columns=[target_column])
+         y = df[target_column]
+
+         # Label-encode the target for classification with categorical labels
+         label_encoder = None
+         if task_type == "classification" and y.dtype == "object":
+             label_encoder = LabelEncoder()
+             y = label_encoder.fit_transform(y)
+
+         # Train/test split (80/20)
+         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+         # Identify numerical and categorical columns
+         num_cols = X.select_dtypes(include=["int64", "float64"]).columns
+         cat_cols = X.select_dtypes(include=["object", "category"]).columns
+
+         # Preprocessing pipeline
+         # Numeric pipeline: impute missing values, then scale
+         num_pipeline = SkPipeline([
+             ("imputer", SimpleImputer(strategy="median")),
+             ("scaler", StandardScaler())
+         ])
+
+         # Categorical pipeline: impute missing values, then one-hot encode
+         cat_pipeline = SkPipeline([
+             ("imputer", SimpleImputer(strategy="most_frequent")),
+             ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
+         ])
+
+         preprocessor = ColumnTransformer([
+             ("num", num_pipeline, num_cols),
+             ("cat", cat_pipeline, cat_cols)
+         ])
+
+         pipeline = SkPipeline([
+             ("preprocessor", preprocessor),
+             ("model", model)
+         ])
+
+         # Train the model
+         pipeline.fit(X_train, y_train)
+
+         # Store the test data and metadata in session state
+         st.session_state.X_test = X_test
+         st.session_state.y_test = y_test
+         st.session_state.task_type = task_type
+         st.session_state.label_encoder = label_encoder  # needed for decoding predictions
+
+         # Reset the test-results flag when a new model is trained
+         if "test_results_calculated" in st.session_state:
+             st.session_state.test_results_calculated = False
+
+         # Clear any previous test metrics to avoid using stale data
+         for key in ['test_metrics', 'test_y_pred', 'test_y_test', 'test_cm', 'sampling_message']:
+             if key in st.session_state:
+                 del st.session_state[key]
+
+         # Return the trained pipeline, plus the label encoder for decoding classification predictions
+         if task_type == "classification":
+             return pipeline, label_encoder
+         else:
+             return pipeline
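Because the return type differs by task, callers (such as model_training_tab above) can normalize it as below. The target name is illustrative, and the call assumes a running Streamlit session for the spinner and session state:

result = train_model(df, target_column="price", task_type="regression",
                     selected_model_name="Linear Regression", hyperparams={})
pipeline = result[0] if isinstance(result, tuple) else result  # tuple only for classification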
src/ui/__init__.py ADDED
@@ -0,0 +1,20 @@
+ from .visualization import visualize_data
+ from .css import load_css
+ from .loading import show_loading_state
+ from .footer import show_footer
+ from .welcome import show_welcome_page
+ from .test_results import display_test_results
+ from .overview import show_overview_page
+ from .insight import display_ai_insights
+
+ __all__ = [
+     "load_css",
+     "show_footer",
+     "show_loading_state",
+     "show_welcome_page",
+     "visualize_data",
+     "display_test_results",
+     "show_overview_page",
+     "display_ai_insights",
+ ]
src/ui/css.py ADDED
@@ -0,0 +1,377 @@
+ # src/css.py
+ import streamlit as st
+
+ def load_css():
+     css = """
+     <style>
+
+     /* --- EXPLICIT COLOR DEFINITIONS --- */
+     :root {
+         --dark-bg: #0E1117;
+         --light-text: #FAFAFA;
+         /* NOTE: --subtitle-text is referenced throughout but was not defined in the
+            original; the muted gray below is an assumed value */
+         --subtitle-text: #A3A8B8;
+         --neon-green: #00FFA3;
+         --neon-font: Arial, Helvetica, sans-serif;
+         --tab-font-size: 18px;
+         --tab-bottom-border-height: 4px;
+         /* Component styling variables */
+         --card-bg: rgba(30, 30, 40, 0.5);
+         --card-border: rgba(0, 255, 163, 0.25);
+         --expander-header-bg: rgba(40, 40, 55, 0.6);
+         --expander-hover-bg: rgba(0, 255, 163, 0.1);
+     }
+
+     /* --- Base Styles --- */
+     body {
+         background-color: var(--dark-bg) !important;
+         color: var(--light-text) !important;
+     }
+     .stApp {
+         background-color: var(--dark-bg) !important;
+     }
+
+     /* --- App Header Styling --- */
+     .app-title {
+         color: var(--neon-green) !important;
+         font-family: var(--neon-font);
+         font-size: 80px !important;
+         font-weight: 700;
+         text-shadow:
+             0 0 1px var(--neon-green),
+             0 0 2px var(--neon-green),
+             0 0 5px var(--neon-green),
+             0 0 45px var(--neon-green);
+         margin-bottom: 25px !important;
+         line-height: 1.0 !important;
+         text-align: center !important;
+     }
+     .app-tagline {
+         color: var(--neon-green) !important;
+         font-family: var(--neon-font);
+         font-size: 27px !important;
+         font-style: normal !important;
+         font-weight: 400 !important;
+         text-shadow:
+             0 0 5px var(--neon-green),
+             0 0 10px var(--neon-green);
+         margin-top: 10px !important;
+         text-align: center !important;
+     }
+     .app-header {
+         padding: 1rem 0 !important;
+         margin-bottom: 2rem !important;
+         text-align: center !important;
+     }
+     /* --- End App Header Styling --- */
+
+
+     /* --- Tab Styling --- */
+     div[data-baseweb="tab-list"] button[data-baseweb="tab"],
+     div[data-baseweb="tab-list"] button[data-baseweb="tab"] > div,
+     div[data-baseweb="tab-list"] button[data-baseweb="tab"] > div > span {
+         font-size: var(--tab-font-size) !important;
+         font-family: var(--neon-font) !important;
+         font-weight: 600 !important;
+     }
+     div[data-baseweb="tab-list"] button[data-baseweb="tab"][aria-selected="true"],
+     div[data-baseweb="tab-list"] button[data-baseweb="tab"][aria-selected="true"] > div,
+     div[data-baseweb="tab-list"] button[data-baseweb="tab"][aria-selected="true"] > div > span {
+         font-size: var(--tab-font-size) !important;
+     }
+     div[data-baseweb="tab-list"] {
+         border: none !important; border-top: none !important; border-right: none !important; border-left: none !important; border-bottom: none !important;
+         border-color: transparent !important; outline: none !important; box-shadow: none !important;
+         margin-bottom: 25px !important; padding: 0 !important;
+         display: flex !important; justify-content: space-around !important;
+     }
+     button[data-baseweb="tab"] {
+         color: var(--light-text) !important; padding: 1rem 1.5rem !important;
+         transition: color 0.3s ease, text-shadow 0.3s ease, border-bottom-color 0.3s ease !important;
+         border-style: solid !important; border-width: 0 0 var(--tab-bottom-border-height) 0 !important;
+         border-color: transparent transparent transparent transparent !important;
+         outline: none !important; box-shadow: none !important; background-color: transparent !important;
+         margin: 0 !important; line-height: normal !important; flex-shrink: 0 !important;
+     }
+     button[data-baseweb="tab"]::before, button[data-baseweb="tab"]::after { display: none !important; content: none !important; }
+     button[data-baseweb="tab"]:hover:not([aria-selected="true"]) {
+         color: var(--neon-green) !important; background-color: transparent !important;
+         border-color: transparent transparent transparent transparent !important; outline: none !important; box-shadow: none !important;
+         text-shadow: 0 0 3px var(--neon-green), 0 0 6px var(--neon-green);
+     }
+     button[data-baseweb="tab"][aria-selected="true"] {
+         border: none !important; border-top: none !important; border-right: none !important; border-left: none !important; border-bottom: none !important;
+         border-color: transparent !important; outline: none !important; box-shadow: none !important; background-color: transparent !important;
+         color: var(--neon-green) !important; border-bottom-style: solid !important;
+         border-bottom-width: var(--tab-bottom-border-height) !important; border-bottom-color: var(--neon-green) !important;
+         text-shadow: 0 0 3px var(--neon-green), 0 0 6px var(--neon-green);
+     }
+     /* --- End Tab Styling --- */
+
+
+     /* --- Welcome Page Specific Styling --- */
+
+     /* Main welcome header (H1) - uses the neon green */
+     .welcome-header h1 {
+         font-size: 2.8rem !important;
+         font-weight: 700 !important;
+         margin-bottom: 0.5rem !important;
+         color: var(--neon-green) !important;
+         text-align: left !important;
+         border-bottom: none !important;
+         /* Optional: a subtle glow like the main title */
+         text-shadow: 0 0 4px rgba(0, 255, 163, 0.7);
+     }
+     /* Main welcome subtitle (P) */
+     .welcome-header p.subtitle {
+         font-size: 1.15rem !important;
+         color: var(--subtitle-text) !important; /* keep the subtitle gray */
+         margin-bottom: 0 !important;
+         text-align: left !important;
+     }
+
+     /* Section headers (H2 generated by st.markdown("## ...")) */
+     .stApp h2 {
+         font-size: 1.9rem !important;
+         font-weight: 600 !important;
+         color: var(--neon-green) !important;
+         border-bottom: 1px solid rgba(0, 255, 163, 0.3);
+         padding-bottom: 8px !important;
+         margin-top: 40px !important;
+         margin-bottom: 25px !important;
+     }
+     /* Override for the sidebar title H2 */
+     section[data-testid="stSidebar"] h2 {
+         border-bottom: none !important;
+         color: #E6E6FA !important;
+         font-size: 1.8rem !important;
+         font-weight: 600 !important;
+         margin: 0 !important;
+         padding-bottom: 0 !important;
+     }
+
+     /* Feature card styling */
+     .feature-card {
+         background-color: var(--card-bg);
+         border: 1px solid var(--card-border);
+         border-radius: 8px;
+         padding: 1.5rem 1.75rem;
+         height: 100%;
+         box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);
+         transition: transform 0.2s ease-in-out, box-shadow 0.2s ease-in-out;
+         margin-bottom: 1rem;
+     }
+     .feature-card:hover {
+         transform: translateY(-3px);
+         box-shadow: 0 6px 12px rgba(0, 255, 163, 0.15);
+     }
+     /* Titles within cards (H3) */
+     .feature-card h3 {
+         font-size: 1.3rem !important;
+         font-weight: 600 !important;
+         color: var(--light-text) !important;
+         margin-top: 0 !important;
+         margin-bottom: 1rem !important;
+         border-bottom: none !important;
+         padding-bottom: 0 !important;
+     }
+     /* Lists within cards */
+     .feature-card ul {
+         padding-left: 0 !important; margin-left: 5px; margin-bottom: 0 !important;
+         list-style-type: none;
+     }
+     .feature-card ul li {
+         margin-bottom: 0.6rem !important; line-height: 1.5;
+         color: var(--subtitle-text) !important; position: relative; padding-left: 1.2em;
+     }
+     .feature-card ul li::before {
+         content: '▪'; color: var(--neon-green); font-weight: bold;
+         display: inline-block; position: absolute; left: 0; top: 0;
+     }
+
+     /* Getting-started list (ordered list - OL) */
+     .stApp ol {
+         padding-left: 0 !important; margin-left: 5px; margin-bottom: 30px !important;
+         list-style-type: none; counter-reset: getting-started-counter;
+     }
+     .stApp ol li {
+         margin-bottom: 12px !important; line-height: 1.6 !important;
+         color: var(--light-text) !important; counter-increment: getting-started-counter;
+         position: relative; padding-left: 2.5em;
+     }
+     .stApp ol li::before {
+         content: counter(getting-started-counter); color: var(--dark-bg); background-color: var(--neon-green);
+         font-weight: bold; border-radius: 50%; width: 1.6em; height: 1.6em; display: inline-block;
+         text-align: center; line-height: 1.6em; position: absolute; left: 0; top: 0;
+     }
+     .stApp ol li strong { color: var(--neon-green) !important; font-weight: 600; }
+
+     /* Expander styling */
+     div[data-testid="stExpander"] {
+         border: none !important; border-radius: 8px !important; margin-bottom: 1rem !important;
+         box-shadow: 0 2px 4px rgba(0,0,0,0.2); background-color: transparent !important; overflow: hidden;
+     }
+     div[data-testid="stExpander"] summary {
+         padding: 0.8rem 1.2rem !important; font-size: 1.1rem !important; font-weight: 600 !important;
+         color: var(--light-text) !important; background-color: var(--expander-header-bg) !important;
+         border: none !important; border-radius: 0 !important;
+         transition: background-color 0.2s ease, color 0.2s ease; cursor: pointer;
+     }
+     div[data-testid="stExpander"] summary:hover {
+         background-color: var(--expander-hover-bg) !important; color: var(--neon-green) !important;
+     }
+     div[data-testid="stExpander"] summary svg { fill: var(--light-text) !important; }
+     div[data-testid="stExpander"] summary:hover svg { fill: var(--neon-green) !important; }
+     div[data-testid="stExpander"] div[role="button"] + div { /* content area */
+         padding: 1.2rem 1.5rem !important; background-color: var(--card-bg); border: none !important;
+     }
+     div[data-testid="stExpander"] div[role="button"] + div ul,
+     div[data-testid="stExpander"] div[role="button"] + div ol { margin-bottom: 0 !important; padding-left: 20px !important; list-style-type: disc; }
+     div[data-testid="stExpander"] div[role="button"] + div li {
+         color: var(--subtitle-text) !important; margin-bottom: 0.5rem !important; list-style-type: disc; padding-left: 0;
+     }
+     div[data-testid="stExpander"] div[role="button"] + div li::before { content: none !important; }
+     div[data-testid="stExpander"] a { color: var(--neon-green) !important; text-decoration: underline; }
+     div[data-testid="stExpander"] a:hover { text-shadow: 0 0 3px var(--neon-green); }
+
+     /* Footer styling */
+     .footer {
+         margin-top: 4rem !important; padding: 1rem !important; font-size: 0.9rem !important;
+         color: var(--subtitle-text) !important; text-align: center; width: 100%;
+         position: relative; bottom: auto; left: auto;
+         border-top: 1px solid rgba(255, 255, 255, 0.1);
+     }
+     /* --- End Welcome Page Specific Styling --- */
+
+
+     /* --- Overview Tab Styling --- */
+
+     /* Style for st.metric containers */
+     div[data-testid="stMetric"] {
+         background-color: var(--card-bg); /* use the card background */
+         border: 1px solid var(--card-border); /* use the card border */
+         border-radius: 8px;
+         padding: 1rem 1.25rem;
+         box-shadow: 0 2px 4px rgba(0,0,0,0.15);
+         transition: transform 0.2s ease-in-out, box-shadow 0.2s ease-in-out;
+         height: 100%; /* ensure metrics in a row are the same height */
+     }
+     div[data-testid="stMetric"]:hover {
+         transform: translateY(-2px); /* lift effect */
+         box-shadow: 0 4px 8px rgba(0, 255, 163, 0.1); /* neon glow */
+     }
+
+     /* Style for the st.metric label */
+     div[data-testid="stMetric"] label[data-testid="stMetricLabel"] {
+         color: var(--subtitle-text) !important; /* dimmer label */
+         font-weight: 500 !important;
+         font-size: 0.95rem !important;
+     }
+
+     /* Style for the st.metric value */
+     div[data-testid="stMetric"] div[data-testid="stMetricValue"] {
+         color: var(--neon-green) !important; /* neon value */
+         font-size: 2.5rem !important; /* larger value */
+         font-weight: 700 !important;
+         padding-top: 5px;
+     }
+     div[data-testid="stMetric"] div[data-testid="stMetricDelta"] {
+         /* style the delta if it is used */
+         font-weight: 500 !important;
+     }
+
+     /* Section headers in the Overview tab (reuses the H3 style) */
+     /* The general .stApp h3 rule should cover this if specific enough; otherwise target directly: */
+     div[data-testid="stVerticalBlock"] h3 { /* assuming the Overview content is in a vertical block */
+         font-size: 1.75rem !important;
+         font-weight: 600 !important;
+         color: var(--neon-green) !important;
+         border-bottom: 1px solid rgba(0, 255, 163, 0.3);
+         padding-bottom: 8px !important;
+         margin-top: 30px !important;
+         margin-bottom: 20px !important;
+     }
+     /* Reset for the feature-card H3 */
+     .feature-card h3 {
+         font-size: 1.3rem !important;
+         border-bottom: none !important;
+         padding-bottom: 0 !important;
+     }
+
+
+     /* DataFrame styling */
+     div[data-testid="stDataFrame"] {
+         border: 1px solid var(--card-border) !important; /* neon border */
+         border-radius: 8px;
+         overflow: hidden; /* ensures the border radius applies to the table */
+         box-shadow: 0 2px 4px rgba(0,0,0,0.2);
+     }
+
+     /* DataFrame header */
+     div[data-testid="stDataFrame"] .col_heading {
+         background-color: var(--expander-header-bg) !important; /* darker header */
+         color: var(--light-text) !important;
+         font-weight: 600 !important;
+         font-size: 0.95rem !important;
+         text-align: left !important;
+         border-bottom: 1px solid var(--neon-green) !important; /* neon underline */
+     }
+     div[data-testid="stDataFrame"] .col_heading:first-of-type {
+         border-top-left-radius: 7px; /* match the container radius */
+     }
+     div[data-testid="stDataFrame"] .col_heading:last-of-type {
+         border-top-right-radius: 7px; /* match the container radius */
+     }
+
+     /* DataFrame cells */
+     div[data-testid="stDataFrame"] .dataframe td,
+     div[data-testid="stDataFrame"] .dataframe th { /* also style the index header */
+         color: var(--subtitle-text) !important;
+         border-bottom: 1px solid rgba(255, 255, 255, 0.1) !important; /* faint row separators */
+         border-right: none !important; /* remove vertical separators */
+         padding: 0.5rem 0.75rem !important;
+         font-size: 0.9rem !important;
+     }
+     div[data-testid="stDataFrame"] .dataframe th { /* index header specifically */
+         background-color: rgba(30, 30, 40, 0.3); /* slightly different background for the index */
+         color: var(--light-text) !important;
+         font-weight: 500 !important;
+     }
+
+     /* DataFrame row hover */
+     div[data-testid="stDataFrame"] .dataframe tr:hover td,
+     div[data-testid="stDataFrame"] .dataframe tr:hover th {
+         background-color: rgba(0, 255, 163, 0.05) !important; /* faint neon hover */
+         color: var(--light-text) !important;
+     }
+
+     /* --- End Overview Tab Styling --- */
+
+
+     /* --- Hide Streamlit elements --- */
+     #MainMenu {visibility: hidden !important;}
+     header {visibility: hidden !important;}
+     .stDeployButton {display: none !important;}
+     div[data-testid="stToolbar"] {display: none !important;}
+     div[data-testid="stDecoration"] {display: none !important;}
+     div[data-testid="stStatusWidget"] {display: none !important;}
+     /* --- End Hide Streamlit elements --- */
+
+     /* --- Sidebar styling --- */
+     section[data-testid="stSidebar"] > div:first-child {
+         background-color: var(--dark-bg) !important;
+     }
+
+     div[data-testid="stMetric"] { color: var(--light-text) !important; }
+     div[data-testid="stMetric"] > div { color: var(--light-text) !important; }
+     div[data-testid="stMetric"] label { color: var(--light-text) !important; }
+     /* --- End Remaining Styles --- */
+
+     </style>
+     """
+     st.markdown(css, unsafe_allow_html=True)
src/ui/footer.py ADDED
@@ -0,0 +1,10 @@
+ import streamlit as st
+
+ def show_footer():
+     """Display footer with copyright information."""
+     footer_html = """
+     <div class="footer">
+         © 2025 AutoML. All Rights Reserved.
+     </div>
+     """
+     st.markdown(footer_html, unsafe_allow_html=True)
src/ui/insight.py ADDED
@@ -0,0 +1,30 @@
+ import streamlit as st
+
+ def display_ai_insights():
+     """Displays AI-Powered Insights and Data Cleaning Process."""
+
+     st.header("💡 AI-Powered Insights")
+
+     with st.expander("🧹 Data Cleaning Process", expanded=True):
+         if "insights" in st.session_state and "df" in st.session_state:
+             # Split insights into cleaning process and analysis
+             parts = st.session_state.insights.split("ANALYSIS INSIGHTS:")
+
+             # Show cleaning instructions
+             st.markdown(parts[0])
+
+             # Show interactive dataframe preview using st.session_state.df
+             st.subheader("Cleaned Data Sample")
+             st.dataframe(
+                 st.session_state.df.head(),  # Use the existing df state
+                 use_container_width=True,
+                 hide_index=True,
+             )
+
+             # Show analysis insights if present
+             if len(parts) > 1:
+                 st.markdown("---")
+                 st.markdown("#### Analysis Insights")
+                 st.markdown(parts[1])
+         else:
+             st.warning("No insights generated yet. Upload and process a file first.")
src/ui/loading.py ADDED
@@ -0,0 +1,119 @@
+ import streamlit as st
+ import time
+
+ def show_loading_state():
+     """
+     Cyber-inspired loading animation with circuit-like effects.
+     """
+     try:
+         st.html("""
+         <div class="loading-container-cyber">
+             <div class="rocket-animation-cyber">
+                 <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100" class="cyber-rocket">
+                     <path d="M50 10 L30 70 L50 65 L70 70 Z" fill="#00FFD1"/>
+                     <path d="M40 80 L50 90 L60 80" stroke="#00FFD1" stroke-width="3" fill="none"/>
+                 </svg>
+             </div>
+
+             <h1 class="title-cyber">AutoML</h1>
+             <h2 class="subtitle-cyber">You Ask, We Deliver</h2>
+
+             <div class="loading-content-cyber">
+                 <p class="loading-text-cyber">Initializing neural networks...</p>
+                 <div class="loading-bar-container-cyber">
+                     <div class="loading-bar-cyber"></div>
+                 </div>
+             </div>
+
+             <style>
+             body { background-color: #000000 !important; }
+
+             .loading-container-cyber {
+                 display: flex;
+                 flex-direction: column;
+                 align-items: center;
+                 justify-content: center;
+                 min-height: 80vh;
+                 text-align: center;
+                 padding: 2rem;
+                 background: radial-gradient(circle, rgba(0,0,0,1) 0%, rgba(0,0,0,1) 100%);
+             }
+
+             .cyber-rocket {
+                 width: 100px;
+                 height: 100px;
+                 animation: pulse 2s infinite;
+             }
+
+             .title-cyber {
+                 font-size: 3rem;
+                 margin-bottom: 0.5rem;
+                 color: #00FFD1;
+                 text-shadow: 0 0 10px #00FFD1;
+                 font-family: 'Orbitron', sans-serif;
+             }
+
+             .subtitle-cyber {
+                 font-size: 1.5rem;
+                 margin-bottom: 2rem;
+                 color: #00A86B;
+                 font-family: 'Chakra Petch', sans-serif;
+             }
+
+             .loading-content-cyber {
+                 background: rgba(0, 255, 209, 0.05);
+                 border: 1px solid rgba(0, 255, 209, 0.2);
+                 padding: 1.5rem 2rem;
+                 border-radius: 8px;
+                 max-width: 600px;
+                 width: 100%;
+             }
+
+             .loading-text-cyber {
+                 margin: 0 0 1rem 0;
+                 font-size: 1.1rem;
+                 color: #00FFD1;
+                 font-family: 'Chakra Petch', sans-serif;
+             }
+
+             .loading-bar-container-cyber {
+                 height: 6px;
+                 background: rgba(0, 255, 209, 0.2);
+                 border-radius: 3px;
+                 overflow: hidden;
+             }
+
+             .loading-bar-cyber {
+                 height: 100%;
+                 width: 30%;
+                 background: linear-gradient(90deg, #00FFD1, #00A86B);
+                 animation: circuit-load 1.5s cubic-bezier(0.4, 0.0, 0.2, 1) infinite;
+             }
+
+             @keyframes pulse {
+                 0%, 100% { transform: scale(1); }
+                 50% { transform: scale(1.1); }
+             }
+
+             @keyframes circuit-load {
+                 0% { transform: translateX(-100%); box-shadow: 0 0 10px #00FFD1; }
+                 50% { box-shadow: 0 0 20px #00FFD1; }
+                 100% { transform: translateX(400%); box-shadow: 0 0 10px #00FFD1; }
+             }
+             </style>
+         </div>
+         """)
+     except Exception:
+         # Fall back to the built-in Streamlit spinner if the custom animation fails
+         st.warning("Custom loading animation unavailable. Using default spinner...")
+         with st.spinner("Loading, please wait..."):
+             time.sleep(3)
+
+
+ if __name__ == "__main__":
+     show_loading_state()
+     time.sleep(3)
+     st.empty()
+     st.success("App loaded successfully!")
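Editor's note: st.html requires a reasonably recent Streamlit release; the try/except above falls back to st.spinner when it is unavailable. A hedged usage sketch (the app_ready session key is illustrative, not part of this module): show the loader only on the first script run, then set a flag so reruns skip it.

    import streamlit as st
    from src.ui.loading import show_loading_state

    if not st.session_state.get("app_ready", False):
        show_loading_state()  # cyber animation, or the spinner fallback
        st.session_state.app_ready = True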
src/ui/overview.py ADDED
@@ -0,0 +1,47 @@
+ import streamlit as st
+ import pandas as pd
+
+ @st.cache_data
+ def compute_column_info(df):
+     """Compute summary statistics for each column."""
+     return pd.DataFrame({
+         "Column": df.dtypes.index,
+         "Type": df.dtypes.astype(str),
+         "Non-Null Count": df.count(),
+         "Null Count": df.isnull().sum(),
+         "Unique Values": df.nunique(),
+     })
+
+ def show_overview_page():
+     """Displays dataset statistics, preview, and column information."""
+
+     if "df" not in st.session_state or st.session_state.df is None:
+         st.warning("⚠️ No dataset loaded. Please upload a dataset first.")
+         return
+
+     df = st.session_state.df
+
+     # Dataset Statistics
+     st.markdown("## 📊 Dataset Statistics")
+     col1, col2, col3, col4 = st.columns(4)
+
+     with col1:
+         st.metric("Total Rows", len(df))
+     with col2:
+         st.metric("Total Columns", len(df.columns))
+     with col3:
+         numeric_count = len(df.select_dtypes(include=["int64", "float64"]).columns)
+         st.metric("Numeric Columns", numeric_count)
+     with col4:
+         categorical_count = len(df.select_dtypes(include=["object", "category"]).columns)
+         st.metric("Categorical Columns", categorical_count)
+
+     # Data Preview: only display the top few rows
+     st.markdown("## 🔍 Data Preview")
+     st.dataframe(df.head(), use_container_width=True)
+
+     # Column Information: use cached computation for faster loading
+     st.markdown("## 📌 Column Information")
+     dtypes_df = compute_column_info(df)
+     st.dataframe(dtypes_df, use_container_width=True)
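Editor's note: a quick way to sanity-check compute_column_info is to run it on a toy frame. It is wrapped in st.cache_data, so inside a Streamlit app it is cached; outside one it still runs, just uncached with a warning. The toy data below is illustrative:

    import pandas as pd
    from src.ui.overview import compute_column_info

    toy = pd.DataFrame({"price": [999.0, None, 1299.0], "brand": ["A", "B", "A"]})
    info = compute_column_info(toy)
    # Columns: Column, Type, Non-Null Count, Null Count, Unique Values
    print(info.to_string(index=False))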
src/ui/test_results.py ADDED
@@ -0,0 +1,295 @@
+ import streamlit as st
+ import io
+ import numpy as np
+ import pandas as pd
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ import plotly.express as px
+ from sklearn.metrics import (
+     accuracy_score,
+     precision_score,
+     recall_score,
+     f1_score,
+     confusion_matrix,
+     mean_absolute_error,
+     mean_squared_error,
+     r2_score,
+ )
+
+ # ==== LLM Setup with Caching ====
+ @st.cache_resource(show_spinner=False)  # Disable the default spinner
+ def get_llm():
+     """Cached LLM initialization to prevent reloading on every rerun."""
+     from langchain_google_genai import ChatGoogleGenerativeAI
+     from langchain_groq import ChatGroq
+     import os
+
+     try:
+         return ChatGroq(
+             model="gemma2-9b-it",
+             groq_api_key=os.getenv("GROQ_API_KEY")
+         )
+     except Exception:
+         try:
+             return ChatGoogleGenerativeAI(
+                 model="gemini-2.0-flash-lite-preview-02-05",
+                 google_api_key=os.getenv("GEMINI_API_KEY")
+             )
+         except Exception:
+             return None
+
+ llm_insights = get_llm()
+
+ # ==== Cached Metric Calculations ====
+ @st.cache_data(show_spinner=False)
+ def _compute_classification_metrics(y_test, y_pred):
+     """Cached metric computation for classification."""
+     return {
+         'accuracy': accuracy_score(y_test, y_pred),
+         'precision': precision_score(y_test, y_pred, average="weighted", zero_division=0),
+         'recall': recall_score(y_test, y_pred, average="weighted", zero_division=0),
+         'f1': f1_score(y_test, y_pred, average="weighted", zero_division=0),
+         'cm': confusion_matrix(y_test, y_pred)
+     }
+
+ @st.cache_data
+ def _compute_regression_metrics(y_test, y_pred):
+     """Cached metric computation for regression."""
+     return {
+         'mae': mean_absolute_error(y_test, y_pred),
+         'mse': mean_squared_error(y_test, y_pred),
+         'rmse': np.sqrt(mean_squared_error(y_test, y_pred)),
+         'r2': r2_score(y_test, y_pred)
+     }
+
+ # ==== Cached Visualization Generation ====
+ @st.cache_data(show_spinner=False)
+ def _plot_confusion_matrix(cm, classes):
+     """Cached confusion matrix plotting."""
+     fig, ax = plt.subplots(figsize=(2, 2), dpi=200)
+     sns.heatmap(
+         cm,
+         annot=True,
+         fmt="d",
+         cmap="Blues",
+         xticklabels=classes,
+         yticklabels=classes,
+         annot_kws={"size": 8},
+     )
+     plt.xticks(fontsize=5)
+     plt.yticks(fontsize=5)
+
+     buf = io.BytesIO()
+     fig.savefig(buf, format="png", bbox_inches="tight", dpi=200)
+     plt.close(fig)  # Free the figure; only the PNG buffer is returned
+     buf.seek(0)
+     return buf
+
+
+ # ==== Optimized Insights Generation ====
+ @st.cache_data(show_spinner=False)
+ def _get_insights_classification(accuracy, precision, recall, f1, cm_shape):
+     """Cached insights generation based on metrics."""
+     if llm_insights is None:
+         # Fallback text; blank lines so Markdown renders each metric on its own line
+         return (
+             f"### Classification Metrics Explained\n\n"
+             f"**Accuracy** ({accuracy:.3f}): Correct predictions ratio\n\n"
+             f"**Precision** ({precision:.3f}): Positive prediction accuracy\n\n"
+             f"**Recall** ({recall:.3f}): Actual positives found\n\n"
+             f"**F1 Score** ({f1:.3f}): Precision-Recall balance\n\n"
+             f"Confusion Matrix ({cm_shape[0]}x{cm_shape[1]}): Prediction vs Actual distribution"
+         )
+
+     try:
+         response = llm_insights.invoke(f"""
+         Briefly explain these classification metrics (accuracy={accuracy:.3f},
+         precision={precision:.3f}, recall={recall:.3f}, f1={f1:.3f})
+         and the {cm_shape[0]}x{cm_shape[1]} confusion matrix.
+         Use markdown bullet points.
+         """)
+         return response.content.strip()
+     except Exception:
+         return "Could not generate AI insights - showing basic metrics explanation."
+
+
+ def display_test_results(trained_model, X_test, y_test, task_type, label_encoder=None):
+     """
+     Displays test results, including metrics, confusion matrix (if classification),
+     and LLM-based or fallback insights about the metrics.
+     """
+     # Create a placeholder for the loading message at the top of the page
+     st.markdown("## Test Results")
+     loading_placeholder = st.empty()
+
+     # Show the initial loading message
+     with loading_placeholder.container():
+         st.info("⏳ Evaluating model performance on test data. This may take a moment for large datasets.")
+         progress_bar = st.progress(0)
+
+     # Set a flag to track whether results have been calculated
+     if "test_results_calculated" not in st.session_state:
+         st.session_state.test_results_calculated = False
+
+     # Only perform calculations if they haven't been done yet
+     if not st.session_state.test_results_calculated:
+
+         sampling_message = None
+         MAX_SAMPLES = 5000  # Increased from 50 to 5000
+
+         # Update progress - starting evaluation
+         with loading_placeholder.container():
+             progress_bar.progress(10)
+
+         if len(X_test) <= MAX_SAMPLES:
+             # Use all test data
+             X_test_sample = X_test
+             y_test_sample = y_test
+             st.info("🔍 Using all test data for evaluation...")
+         else:
+             # Use sampling for large datasets
+             sampling_message = f"📊 Using {MAX_SAMPLES} samples from the test set for visualization (out of {len(X_test)} total)"
+             st.info("🔍 Sampling test data for evaluation...")
+
+             # Simple random sampling (len() works for both DataFrames and arrays)
+             idx = np.random.choice(len(X_test), size=MAX_SAMPLES, replace=False)
+             X_test_sample = X_test.iloc[idx] if hasattr(X_test, 'iloc') else X_test[idx]
+             y_test_sample = y_test.iloc[idx] if hasattr(y_test, 'iloc') else y_test[idx]
+
+         # Generate predictions
+         with loading_placeholder.container():
+             progress_bar.progress(30)
+             st.info("🔄 Generating predictions... Please wait")
+             # Add a spinner for visual feedback during prediction
+             with st.spinner("Model working..."):
+                 if task_type == "regression":
+                     y_pred = trained_model.predict(X_test_sample)
+                 elif task_type == "classification":
+                     # trained_model may itself be a (pipeline, encoder) tuple;
+                     # otherwise pair the bare pipeline with the explicit encoder
+                     pipeline, enc = trained_model if label_encoder is None else (trained_model, label_encoder)
+                     y_pred = pipeline.predict(X_test_sample)
+
+                     # Decode if a label encoder is used
+                     if enc:
+                         y_pred = enc.inverse_transform(y_pred)
+                         y_test_decoded = enc.inverse_transform(y_test_sample)
+                     else:
+                         y_test_decoded = y_test_sample
+
+         # Update progress - computing metrics
+         with loading_placeholder.container():
+             progress_bar.progress(60)
+             st.info("📊 Computing metrics...")
+
+         # Compute metrics
+         if task_type == "regression":
+             metrics = _compute_regression_metrics(y_test_sample, y_pred)
+         else:
+             metrics = _compute_classification_metrics(y_test_decoded, y_pred)
+
+         # Update progress - preparing visualizations
+         with loading_placeholder.container():
+             progress_bar.progress(90)
+             st.info("📈 Preparing visualizations...")
+
+         # For classification, pre-calculate the confusion matrix before showing the "ready" message
+         if task_type == "classification":
+             # Pre-calculate the confusion matrix (this is the slow part)
+             _ = _plot_confusion_matrix(metrics['cm'], np.unique(y_test_decoded))
+             # Pre-calculate insights (also potentially slow with an LLM)
+             _ = _get_insights_classification(
+                 metrics['accuracy'],
+                 metrics['precision'],
+                 metrics['recall'],
+                 metrics['f1'],
+                 metrics['cm'].shape
+             )
+
+         # Update progress - complete (only after all calculations are done)
+         with loading_placeholder.container():
+             progress_bar.progress(100)
+             st.success("✅ Test results ready!")
+
+         # Mark results as calculated
+         st.session_state.test_results_calculated = True
+
+         # Store results in session state for reuse
+         st.session_state.test_metrics = metrics
+         if task_type == "classification":
+             st.session_state.test_y_pred = y_pred
+             st.session_state.test_y_test = y_test_decoded
+         else:
+             st.session_state.test_y_pred = y_pred
+             st.session_state.test_y_test = y_test_sample
+
+         # Store the sampling message
+         st.session_state.sampling_message = sampling_message
+
+         # Import time only when needed (moved from global to local scope)
+         import time
+         time.sleep(0.5)  # Short delay to show the "Test results ready!" message
+
+     # Display the sampling message if it exists
+     if "sampling_message" in st.session_state and st.session_state.sampling_message:
+         st.info(st.session_state.sampling_message)
+
+     # Display the results using stored values
+     if task_type == "regression":
+         st.subheader("🔍 Regression Metrics")
+
+         # Get metrics from session state or use the ones we just calculated
+         if "test_metrics" in st.session_state and st.session_state.test_results_calculated:
+             metrics = st.session_state.test_metrics
+             y_pred = st.session_state.test_y_pred
+             y_test = st.session_state.test_y_test
+
+             # Reuse the cached RMSE instead of recomputing it
+             mae, mse, rmse, r2 = metrics['mae'], metrics['mse'], metrics['rmse'], metrics['r2']
+
+             col1, col2, col3, col4 = st.columns(4)
+             col1.metric("📉 MAE", f"{mae:.4f}")
+             col2.metric("📊 MSE", f"{mse:.4f}")
+             col3.metric("📈 RMSE", f"{rmse:.4f}")
+             col4.metric("📌 R² Score", f"{r2:.4f}")
+
+             # Add the regression visualization
+             st.subheader("📈 Prediction vs Actual")
+             df_results = pd.DataFrame({
+                 'Actual': y_test,
+                 'Predicted': y_pred
+             })
+             fig = px.scatter(df_results, x='Actual', y='Predicted',
+                              title='Predicted vs Actual Values',
+                              labels={'Actual': 'Actual Values', 'Predicted': 'Predicted Values'})
+             fig.add_shape(type='line', x0=min(y_test), y0=min(y_test),
+                           x1=max(y_test), y1=max(y_test),
+                           line=dict(color='red', dash='dash'))
+             st.plotly_chart(fig, use_container_width=True)
+
+     elif task_type == "classification":
+         st.subheader("🔍 Classification Metrics")
+
+         # Get metrics from session state or use the ones we just calculated
+         if "test_metrics" in st.session_state and st.session_state.test_results_calculated:
+             metrics = st.session_state.test_metrics
+             y_pred = st.session_state.test_y_pred
+             y_test_decoded = st.session_state.test_y_test
+
+             accuracy, precision, recall, f1 = metrics['accuracy'], metrics['precision'], metrics['recall'], metrics['f1']
+
+             col1, col2, col3, col4 = st.columns(4)
+             col1.metric("✅ Accuracy", f"{accuracy:.4f}")
+             col2.metric("🎯 Precision", f"{precision:.4f}")
+             col3.metric("📢 Recall", f"{recall:.4f}")
+             col4.metric("🔥 F1 Score", f"{f1:.4f}")
+
+             st.subheader("📊 Confusion Matrix")
+             # Use the cached function for the confusion matrix visualization
+             buf = _plot_confusion_matrix(metrics['cm'], np.unique(y_test_decoded))
+             st.image(buf, width=450)
+
+             # === Additional Insights Section ===
+             st.markdown("---")
+             st.markdown("#### Test Insights")
+             classification_insights = _get_insights_classification(accuracy, precision, recall, f1, metrics['cm'].shape)
+             st.markdown(classification_insights)
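Editor's note: for classification, display_test_results expects either trained_model=(pipeline, encoder) with label_encoder=None, or a bare pipeline plus an explicit label_encoder (see the tuple unpack above). A minimal invocation sketch inside a Streamlit app, using scikit-learn toys (the dataset and model choices are illustrative):

    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import LabelEncoder
    from src.ui.test_results import display_test_results

    X, y = load_iris(return_X_y=True)
    enc = LabelEncoder().fit(y)
    X_tr, X_te, y_tr, y_te = train_test_split(X, enc.transform(y), random_state=42)
    clf = LogisticRegression(max_iter=1000).fit(X_tr, y_tr)
    # Pass the (model, encoder) tuple; label_encoder stays None
    display_test_results((clf, enc), X_te, y_te, task_type="classification")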
src/ui/visualization.py ADDED
@@ -0,0 +1,556 @@
+ import re
+ import streamlit as st
+ import plotly.express as px
+ import plotly.graph_objects as go
+ from plotly.subplots import make_subplots
+ import pandas as pd
+ import numpy as np
+ from src.utils.logging import log_frontend_error, log_frontend_warning
+
+ SAMPLE_SIZE = 10000  # Define a sample size for subsampling large datasets
+
+ # Efficiently hash a dataframe to detect changes
+ @st.cache_data(show_spinner=False)
+ def compute_df_hash(df):
+     """Optimized, sample-based dataframe hashing."""
+     return hash((df.shape, pd.util.hash_pandas_object(df.iloc[:min(100, len(df))]).sum()))
+
+
+ @st.cache_data(show_spinner=False, ttl=3600)  # Cache for 1 hour
+ def is_potential_date_column(series, sample_size=5):
+     """Check if a column might contain dates."""
+     # Check the column name first
+     if any(keyword in series.name.lower() for keyword in ['date', 'time', 'year', 'month', 'day']):
+         return True
+
+     # Check sample values
+     sample = series.dropna().head(sample_size).astype(str)
+     date_patterns = [
+         r'\d{4}-\d{2}-\d{2}',    # YYYY-MM-DD
+         r'\d{2}/\d{2}/\d{4}',    # MM/DD/YYYY
+         r'\d{2}-\w{3}-\d{2,4}',  # DD-MON-YY(Y)
+         r'\d{1,2} \w{3,} \d{4}'  # 1 January 2023
+     ]
+
+     date_count = sum(1 for val in sample if any(re.match(p, val) for p in date_patterns))
+     return date_count / len(sample) > 0.5 if len(sample) > 0 else False  # >50% match
+
+
+ # Cache column type detection with improved performance
+ @st.cache_data(show_spinner=False, ttl=3600)  # Cache for 1 hour
+ def get_column_types(df):
+     """Detect column types efficiently and cache the results."""
+     column_types = {}
+
+     # Process columns in batches for better performance
+     for chunk_start in range(0, len(df.columns), 10):
+         chunk_end = min(chunk_start + 10, len(df.columns))
+         chunk_columns = df.columns[chunk_start:chunk_end]
+
+         for column in chunk_columns:
+             # Check for numeric columns
+             if pd.api.types.is_numeric_dtype(df[column]):
+                 # Binary column (0/1, True/False)
+                 if df[column].nunique() <= 2:
+                     column_types[column] = "BINARY"
+                 # Discrete numeric column (few unique values)
+                 elif df[column].nunique() < 20:
+                     column_types[column] = "NUMERIC_DISCRETE"
+                 # Otherwise it's a continuous numeric column
+                 else:
+                     column_types[column] = "NUMERIC_CONTINUOUS"
+             else:
+                 # Check for temporal/date columns
+                 if is_potential_date_column(df[column]):
+                     try:
+                         # Attempt conversion with coerce
+                         converted = pd.to_datetime(df[column], errors='coerce')
+                         if not converted.isnull().all():  # At least some valid dates
+                             column_types[column] = "TEMPORAL"
+                             continue
+                     except Exception:
+                         pass
+
+                 # ID-like columns (high cardinality plus a name hint)
+                 if (df[column].nunique() > len(df) * 0.9 and
+                         any(x in column.lower() for x in ['id', 'code', 'key', 'uuid', 'identifier'])):
+                     column_types[column] = "ID"
+                 # Categorical columns (low to medium cardinality)
+                 elif df[column].nunique() <= 20:
+                     column_types[column] = "CATEGORICAL"
+                 # Otherwise it's a text column
+                 else:
+                     column_types[column] = "TEXT"
+
+     return column_types
+
+
+ # Cache correlation matrix computation with improved performance
+ @st.cache_data(show_spinner=False, ttl=3600)  # Cache for 1 hour
+ def get_corr_matrix(df):
+     """Compute and cache the correlation matrix for numeric columns."""
+     # Only select numeric columns to avoid errors
+     numeric_cols = df.select_dtypes(include=[np.number]).columns
+
+     # If there are too many numeric columns, take the first 30 for better performance
+     if len(numeric_cols) > 30:
+         numeric_cols = numeric_cols[:30]
+
+     # Return the correlation matrix if there are at least 2 numeric columns
+     return df[numeric_cols].corr() if len(numeric_cols) > 1 else None
+
+
+ # Cache subsampled data with improved performance
+ @st.cache_data(show_spinner=False, ttl=3600)  # Cache for 1 hour
+ def get_subsampled_data(df, column):
+     """Return subsampled data for faster visualization."""
+     # Check that the column exists
+     if column not in df.columns:
+         return pd.DataFrame()
+
+     # Use stratified sampling for categorical columns if possible
+     if df[column].nunique() < 20 and len(df) > SAMPLE_SIZE:
+         try:
+             # Try to get a representative sample
+             fraction = min(0.5, SAMPLE_SIZE / len(df))
+             return df[[column]].groupby(column, group_keys=False).apply(
+                 lambda x: x.sample(max(1, int(fraction * len(x))), random_state=42)
+             )
+         except Exception:
+             # Fall back to random sampling
+             pass
+
+     # Use random sampling
+     return df[[column]].sample(min(len(df), SAMPLE_SIZE), random_state=42)
+
+
+ # Cache chart creation with improved performance
+ @st.cache_data(show_spinner=False, ttl=1800, hash_funcs={  # Cache for 30 minutes
+     pd.DataFrame: compute_df_hash,
+     pd.Series: lambda s: hash((s.name, compute_df_hash(s.to_frame())))
+ })
+ def create_chart(df, column, column_type):
+     """Generate optimized charts based on column type."""
+     # Check that the column exists in the dataframe
+     if column not in df.columns:
+         return None
+
+     # Get subsampled data for better performance
+     df_sample = get_subsampled_data(df, column)
+     if df_sample.empty:
+         return None
+
+     try:
+         # Year-based columns (special case)
+         if "year" in column.lower():
+             fig = make_subplots(rows=1, cols=2, subplot_titles=("Year Distribution", "Box Plot"),
+                                 specs=[[{"type": "bar"}, {"type": "box"}]],
+                                 column_widths=[0.7, 0.3], horizontal_spacing=0.1)
+             year_counts = df_sample[column].value_counts().sort_index()
+             fig.add_trace(go.Bar(x=year_counts.index, y=year_counts.values, marker_color='#7B68EE'), row=1, col=1)
+             fig.add_trace(go.Box(x=df_sample[column], marker_color='#7B68EE'), row=1, col=2)
+
+         # Binary columns (0/1, True/False)
+         elif column_type == "BINARY":
+             value_counts = df_sample[column].value_counts()
+             fig = make_subplots(rows=1, cols=2,
+                                 subplot_titles=("Distribution", "Percentage"),
+                                 specs=[[{"type": "bar"}, {"type": "pie"}]],
+                                 column_widths=[0.5, 0.5],
+                                 horizontal_spacing=0.1)
+
+             fig.add_trace(go.Bar(
+                 x=value_counts.index,
+                 y=value_counts.values,
+                 marker_color=['#FF4B4B', '#4CAF50'],
+                 text=value_counts.values,
+                 textposition='auto'
+             ), row=1, col=1)
+
+             fig.add_trace(go.Pie(
+                 labels=value_counts.index,
+                 values=value_counts.values,
+                 marker=dict(colors=['#FF4B4B', '#4CAF50']),
+                 textinfo='percent+label'
+             ), row=1, col=2)
+
+             fig.update_layout(title_text=f"Binary Distribution: {column}")
+
+         # Continuous numeric columns
+         elif column_type == "NUMERIC_CONTINUOUS":
+             fig = make_subplots(rows=2, cols=2,
+                                 subplot_titles=("Distribution", "Box Plot", "Violin Plot", "Cumulative Distribution"),
+                                 specs=[[{"type": "histogram"}, {"type": "box"}],
+                                        [{"type": "violin"}, {"type": "scatter"}]],
+                                 vertical_spacing=0.15,
+                                 horizontal_spacing=0.1)
+
+             # Histogram
+             fig.add_trace(go.Histogram(
+                 x=df_sample[column],
+                 nbinsx=30,
+                 marker_color='#FF4B4B',
+                 opacity=0.7
+             ), row=1, col=1)
+
+             # Box plot
+             fig.add_trace(go.Box(
+                 x=df_sample[column],
+                 marker_color='#FF4B4B',
+                 boxpoints='outliers'
+             ), row=1, col=2)
+
+             # Violin plot
+             fig.add_trace(go.Violin(
+                 x=df_sample[column],
+                 marker_color='#FF4B4B',
+                 box_visible=True,
+                 points='outliers'
+             ), row=2, col=1)
+
+             # Empirical CDF
+             sorted_data = np.sort(df_sample[column].dropna())
+             cumulative = np.arange(1, len(sorted_data) + 1) / len(sorted_data)
+
+             fig.add_trace(go.Scatter(
+                 x=sorted_data,
+                 y=cumulative,
+                 mode='lines',
+                 line=dict(color='#FF4B4B', width=2)
+             ), row=2, col=2)
+
+             fig.update_layout(height=600, title_text=f"Continuous Variable Analysis: {column}")
+
+         # Discrete numeric columns
+         elif column_type == "NUMERIC_DISCRETE":
+             value_counts = df_sample[column].value_counts().sort_index()
+             fig = make_subplots(rows=1, cols=2,
+                                 subplot_titles=("Distribution", "Percentage"),
+                                 specs=[[{"type": "bar"}, {"type": "pie"}]],
+                                 column_widths=[0.7, 0.3],
+                                 horizontal_spacing=0.1)
+
+             fig.add_trace(go.Bar(
+                 x=value_counts.index,
+                 y=value_counts.values,
+                 marker_color='#FF4B4B',
+                 text=value_counts.values,
+                 textposition='auto'
+             ), row=1, col=1)
+
+             fig.add_trace(go.Pie(
+                 labels=value_counts.index,
+                 values=value_counts.values,
+                 marker=dict(colors=px.colors.sequential.Reds),
+                 textinfo='percent+label'
+             ), row=1, col=2)
+
+             fig.update_layout(title_text=f"Discrete Numeric Distribution: {column}")
+
+         # Categorical columns
+         elif column_type == "CATEGORICAL":
+             value_counts = df_sample[column].value_counts().head(20)  # Limit to the top 20 categories
+             fig = make_subplots(rows=1, cols=2,
+                                 subplot_titles=("Category Distribution", "Percentage Breakdown"),
+                                 specs=[[{"type": "bar"}, {"type": "pie"}]],
+                                 column_widths=[0.6, 0.4],
+                                 horizontal_spacing=0.1)
+
+             # Bar chart
+             fig.add_trace(go.Bar(
+                 x=value_counts.index,
+                 y=value_counts.values,
+                 marker_color='#00FFA3',
+                 text=value_counts.values,
+                 textposition='auto'
+             ), row=1, col=1)
+
+             # Pie chart
+             fig.add_trace(go.Pie(
+                 labels=value_counts.index,
+                 values=value_counts.values,
+                 marker=dict(colors=px.colors.sequential.Greens),
+                 textinfo='percent+label'
+             ), row=1, col=2)
+
+             fig.update_layout(title_text=f"Categorical Analysis: {column}")
+
+         # Temporal/date columns
+         elif column_type == "TEMPORAL":
+             # Convert with safe datetime parsing
+             dates = pd.to_datetime(df_sample[column], errors='coerce', format='mixed')
+             valid_dates = dates[dates.notna()]
+
+             fig = make_subplots(
+                 rows=2,
+                 cols=2,
+                 subplot_titles=("Monthly Pattern", "Yearly Pattern", "Cumulative Trend", "Day of Week Distribution"),
+                 vertical_spacing=0.15,
+                 horizontal_spacing=0.1,
+                 specs=[[{"type": "bar"}, {"type": "bar"}],
+                        [{"type": "scatter"}, {"type": "bar"}]]
+             )
+
+             if not valid_dates.empty:
+                 # Monthly pattern
+                 monthly_counts = valid_dates.dt.month.value_counts().sort_index()
+                 month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
+                 month_labels = [month_names[i-1] for i in monthly_counts.index]
+
+                 fig.add_trace(go.Bar(
+                     x=month_labels,
+                     y=monthly_counts.values,
+                     marker_color='#7B68EE',
+                     text=monthly_counts.values,
+                     textposition='auto'
+                 ), row=1, col=1)
+
+                 # Yearly pattern
+                 yearly_counts = valid_dates.dt.year.value_counts().sort_index()
+
+                 fig.add_trace(go.Bar(
+                     x=yearly_counts.index,
+                     y=yearly_counts.values,
+                     marker_color='#7B68EE',
+                     text=yearly_counts.values,
+                     textposition='auto'
+                 ), row=1, col=2)
+
+                 # Cumulative trend
+                 sorted_dates = valid_dates.sort_values()
+                 cumulative = np.arange(1, len(sorted_dates) + 1)
+
+                 fig.add_trace(go.Scatter(
+                     x=sorted_dates,
+                     y=cumulative,
+                     mode='lines',
+                     line=dict(color='#7B68EE', width=2)
+                 ), row=2, col=1)
+
+                 # Day-of-week distribution
+                 dow_counts = valid_dates.dt.dayofweek.value_counts().sort_index()
+                 dow_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
+                 dow_labels = [dow_names[i] for i in dow_counts.index]
+
+                 fig.add_trace(go.Bar(
+                     x=dow_labels,
+                     y=dow_counts.values,
+                     marker_color='#7B68EE',
+                     text=dow_counts.values,
+                     textposition='auto'
+                 ), row=2, col=2)
+
+             fig.update_layout(height=600, title_text=f"Temporal Analysis: {column}")
+
+         # ID columns (show length distribution and common prefixes)
+         elif column_type == "ID":
+             # Calculate ID length statistics
+             id_lengths = df_sample[column].astype(str).str.len()
+
+             # Extract the first 2 characters for prefix analysis
+             id_prefixes = df_sample[column].astype(str).str[:2].value_counts().head(15)
+
+             fig = make_subplots(
+                 rows=1,
+                 cols=2,
+                 subplot_titles=("ID Length Distribution", "Common ID Prefixes"),
+                 horizontal_spacing=0.1,
+                 specs=[[{"type": "histogram"}, {"type": "bar"}]]
+             )
+
+             # ID length histogram
+             fig.add_trace(go.Histogram(
+                 x=id_lengths,
+                 nbinsx=20,
+                 marker_color='#9C27B0'
+             ), row=1, col=1)
+
+             # ID prefix bar chart
+             fig.add_trace(go.Bar(
+                 x=id_prefixes.index,
+                 y=id_prefixes.values,
+                 marker_color='#9C27B0',
+                 text=id_prefixes.values,
+                 textposition='auto'
+             ), row=1, col=2)
+
+             fig.update_layout(title_text=f"ID Analysis: {column}")
+
+         # Text columns
+         elif column_type == "TEXT":
+             # For text columns, show top values and the length distribution
+             value_counts = df_sample[column].value_counts().head(15)
+
+             # Calculate text length statistics
+             text_lengths = df_sample[column].astype(str).str.len()
+
+             fig = make_subplots(
+                 rows=2,
+                 cols=1,
+                 subplot_titles=("Top Values", "Text Length Distribution"),
+                 vertical_spacing=0.2,
+                 specs=[[{"type": "bar"}], [{"type": "histogram"}]]
+             )
+
+             # Top values bar chart
+             fig.add_trace(
+                 go.Bar(
+                     x=value_counts.index,
+                     y=value_counts.values,
+                     marker_color='#00B4D8',
+                     text=value_counts.values,
+                     textposition='auto'
+                 ),
+                 row=1, col=1
+             )
+
+             # Text length histogram
+             fig.add_trace(
+                 go.Histogram(
+                     x=text_lengths,
+                     nbinsx=20,
+                     marker_color='#00B4D8'
+                 ),
+                 row=2, col=1
+             )
+
+             fig.update_layout(
+                 height=600,
+                 title_text=f"Text Analysis: {column}"
+             )
+
+         # Fallback for any other column type
+         else:
+             fig = go.Figure(go.Histogram(x=df_sample[column], marker_color='#888'))
+             fig.update_layout(title_text=f"Generic Analysis: {column}")
+
+         # Common layout settings; branches that build taller multi-row figures set
+         # their own height, so only apply the 400px default when none was set
+         if fig.layout.height is None:
+             fig.update_layout(height=400)
+         fig.update_layout(
+             showlegend=False,
+             plot_bgcolor='rgba(0,0,0,0)',
+             paper_bgcolor='rgba(0,0,0,0)',
+             font=dict(color='#FFFFFF'),
+             margin=dict(l=40, r=40, t=50, b=40)
+         )
+
+         return fig
+
+     except Exception as e:
+         log_frontend_error("Chart Generation", f"Error creating chart for {column}: {str(e)}")
+         return None
+
+
+ def visualize_data(df):
+     """Automated dashboard with optimized visualizations."""
+     if df is None or df.empty:
+         st.error("❌ No data available. Please upload and clean your data first.")
+         return
+
+     # Calculate the dataframe hash only once
+     df_hash = compute_df_hash(df)
+
+     # Initialize selected columns in session state if not already present
+     if "selected_viz_columns" not in st.session_state:
+         # Initialize with the first 4 columns, or fewer if df has fewer columns
+         initial_columns = list(df.columns[:min(4, len(df.columns))])
+         st.session_state.selected_viz_columns = initial_columns
+
+     # Filter out any columns that no longer exist in the dataframe
+     valid_columns = [col for col in st.session_state.selected_viz_columns if col in df.columns]
+
+     # Callback to update the selected columns
+     def on_column_selection_change():
+         # Store the selected columns in session state
+         st.session_state.selected_viz_columns = st.session_state.viz_column_selector
+         # Ensure we stay on the visualization tab (index 2)
+         st.session_state.current_tab_index = 2
+
+     # Use session state for the multiselect with a consistent key and callback
+     selected_columns = st.multiselect(
+         "Select columns to visualize",
+         options=df.columns,
+         default=valid_columns,
+         key="viz_column_selector",
+         on_change=on_column_selection_change
+     )
+
+     # Recompute column types and the correlation matrix only when:
+     # 1. column_types is not in session_state yet
+     # 2. The dataframe hash has changed (new data)
+     # 3. A user-uploaded dataset is being used for the first time
+     recompute_needed = (
+         "column_types" not in st.session_state or
+         "df_hash" not in st.session_state or
+         st.session_state.get("df_hash") != df_hash
+     )
+
+     if recompute_needed:
+         with st.spinner("🔄 Analyzing data structure..."):
+             # Compute and cache column types
+             st.session_state.column_types = get_column_types(df)
+             # Compute and cache the correlation matrix
+             st.session_state.corr_matrix = get_corr_matrix(df)
+             # Update the dataframe hash
+             st.session_state.df_hash = df_hash
+             # Ensure we stay on the visualization tab
+             st.session_state.current_tab_index = 2
+
+             # Reset any test results if the data has changed
+             if "test_results_calculated" in st.session_state:
+                 st.session_state.test_results_calculated = False
+                 # Clear previous test metrics to avoid using stale data
+                 for key in ['test_metrics', 'test_y_pred', 'test_y_test', 'test_cm', 'sampling_message']:
+                     if key in st.session_state:
+                         del st.session_state[key]
+
+     # Use cached values from session state
+     column_types = st.session_state.column_types
+     corr_matrix = st.session_state.corr_matrix
+
+     if selected_columns:
+         # Use a container to wrap all visualizations
+         viz_container = st.container()
+
+         with viz_container:
+             for idx in range(0, len(selected_columns), 2):
+                 col1, col2 = st.columns(2)
+
+                 for i, col in enumerate([col1, col2]):
+                     if idx + i < len(selected_columns):
+                         column = selected_columns[idx + i]
+                         with col:
+                             # Use consistent keys for charts based on the column name
+                             chart_key = f"plot_{column.replace(' ', '_')}"
+
+                             # Only create a chart if the column has a detected type
+                             if column in column_types:
+                                 fig = create_chart(df, column, column_types[column])
+                                 if fig:
+                                     st.plotly_chart(fig, use_container_width=True, key=chart_key)
+                                     with st.expander(f"📊 Summary Statistics - {column}", expanded=False):
+                                         if "NUMERIC" in column_types[column]:
+                                             st.dataframe(df[column].describe(), key=f"stats_{column.replace(' ', '_')}")
+                                         else:
+                                             st.dataframe(df[column].value_counts(), key=f"counts_{column.replace(' ', '_')}")
+                             else:
+                                 st.warning(f"⚠️ Column '{column}' not found in the dataset or its type couldn't be determined.")
+
+         if corr_matrix is not None:
+             st.subheader("🔗 Correlation Analysis")
+             fig = px.imshow(corr_matrix, title="Correlation Matrix", color_continuous_scale="RdBu")
+             st.plotly_chart(fig, use_container_width=True, key="corr_matrix_plot")
+
+     else:
+         st.info("👆 Please select columns to visualize")
src/ui/welcome.py ADDED
@@ -0,0 +1,186 @@
+ import streamlit as st
+ from src.preprocessing.clean_data import cached_clean_csv
+ import pandas as pd
+ from functools import lru_cache
+
+ # Cache static content to avoid recomputation
+ @lru_cache(maxsize=1)
+ def get_static_content():
+     """Cache static HTML content to avoid regeneration."""
+     welcome_header = """
+     <div class="welcome-header" style="text-align: left; margin-bottom: 2rem;">
+         <h1>Experience AI like never before</h1>
+         <p class="subtitle">
+             Performance, Analysis, Insights Made Simple.
+         </p>
+     </div>
+     """
+     features_header = "## ✨ Key Features"
+     feature_cards = [
+         """
+         <div class="feature-card">
+             <h3>📊 Data Analysis</h3>
+             <ul>
+                 <li>Automated data cleaning</li>
+                 <li>Interactive visualizations</li>
+                 <li>Statistical insights</li>
+                 <li>Correlation analysis</li>
+             </ul>
+         </div>
+         """,
+         """
+         <div class="feature-card">
+             <h3>🤖 Machine Learning</h3>
+             <ul>
+                 <li>Multiple ML algorithms</li>
+                 <li>Automated model selection</li>
+                 <li>Hyperparameter tuning</li>
+                 <li>Performance metrics</li>
+             </ul>
+         </div>
+         """,
+         """
+         <div class="feature-card">
+             <h3>🔍 AI Insights</h3>
+             <ul>
+                 <li>Data quality checks</li>
+                 <li>Feature importance</li>
+                 <li>Model explanations</li>
+                 <li>Smart recommendations</li>
+             </ul>
+         </div>
+         """
+     ]
+     getting_started = """
+     ## 🚀 Getting Started
+     1. **Upload Your Dataset**: Use the sidebar to upload your CSV file
+     2. **Explore Data**: View statistics and visualizations in the Overview tab
+     3. **Train Models**: Select algorithms and tune parameters
+     4. **Get Insights**: Receive AI-powered recommendations
+     """
+     dataset_requirements = """
+     * File format: CSV
+     * Maximum size: 200MB
+     * Supported column types:
+         * Numeric (int, float)
+         * Categorical (string, boolean)
+         * Temporal (date, datetime)
+     * Clean data preferred, but not required
+     """
+     example_datasets = """
+     Try these example datasets to explore the app:
+     * [Iris Dataset](https://archive.ics.uci.edu/ml/datasets/iris)
+     * [Boston Housing](https://www.kaggle.com/c/boston-housing)
+     * [Wine Quality](https://archive.ics.uci.edu/ml/datasets/wine+quality)
+     """
+     return welcome_header, features_header, feature_cards, getting_started, dataset_requirements, example_datasets
+
+ def show_welcome_page():
+     """Display the welcome page with features and instructions efficiently."""
+     # Load cached static content
+     welcome_header, features_header, feature_cards, getting_started, dataset_requirements, example_datasets = get_static_content()
+
+     # Render static content
+     st.markdown(welcome_header, unsafe_allow_html=True)
+     st.markdown(features_header, unsafe_allow_html=True)
+
+     # Feature columns with minimal overhead
+     col1, col2, col3 = st.columns(3, gap="medium")
+     with col1:
+         st.markdown(feature_cards[0], unsafe_allow_html=True)
+     with col2:
+         st.markdown(feature_cards[1], unsafe_allow_html=True)
+     with col3:
+         st.markdown(feature_cards[2], unsafe_allow_html=True)
+
+     st.markdown("<br>", unsafe_allow_html=True)  # Spacing
+
+     # Getting Started and expanders
+     st.markdown(getting_started, unsafe_allow_html=True)
+     with st.expander("📋 Dataset Requirements"):
+         st.markdown(dataset_requirements)
+
+     with st.expander("🎯 Example Datasets"):
+         st.markdown(example_datasets)
+
+     # File uploader section
+     st.markdown("### 📤 Upload Your Dataset (Currently Using Default Dataset)")
+
+     # Checkbox to indicate the dataset is already cleaned
+     skip_cleaning = st.checkbox("My dataset is already cleaned (skip cleaning)")
+
+     uploaded_file = st.file_uploader("Upload CSV file", type=["csv"])
+
+     if uploaded_file is not None:
+         try:
+             # Validate file size
+             file_details = {"FileName": uploaded_file.name, "FileType": uploaded_file.type, "FileSize": uploaded_file.size}
+             if uploaded_file.size > 200 * 1024 * 1024:  # 200MB limit
+                 st.error("❌ File size exceeds 200MB limit. Please upload a smaller file.")
+                 return
+
+             # Attempt to read the CSV
+             try:
+                 df = pd.read_csv(uploaded_file)
+                 if df.empty:
+                     st.error("❌ The uploaded file is empty. Please upload a file with data.")
+                     return
+
+                 st.success("✅ Dataset uploaded successfully!")
+             except pd.errors.EmptyDataError:
+                 st.error("❌ The uploaded file is empty. Please upload a file with data.")
+                 return
+             except pd.errors.ParserError:
+                 st.error("❌ Unable to parse the CSV file. Please ensure it's properly formatted.")
+                 return
+
+             # Convert the dataframe to JSON for caching
+             df_json = df.to_json(orient='records')
+
+             # Use the cached cleaning function with proper error handling
+             with st.spinner("🧠 AI is analyzing and cleaning the data..." if not skip_cleaning else "Processing dataset..."):
+                 try:
+                     cleaned_df, insights = cached_clean_csv(df_json, skip_cleaning)
+                 except Exception as cleaning_error:
+                     st.error(f"❌ Error during data cleaning: {str(cleaning_error)}")
+                     # Fall back to using the original dataframe
+                     st.warning("⚠️ Using original dataset without cleaning due to errors.")
+                     cleaned_df = df
+                     insights = "Cleaning failed, using original data."
+
+             # Save results to session state
+             st.session_state.df = cleaned_df
+             st.session_state.insights = insights
+             st.session_state.data_cleaned = True
+             st.session_state.dataset_loaded = True
+
+             # Flag that this is a user-uploaded dataset
+             st.session_state.is_user_uploaded = True
+
+             # Store the original dataframe JSON and the skip_cleaning preference;
+             # this helps prevent redundant cleaning
+             st.session_state.original_df_json = df_json
+             st.session_state.skip_cleaning = skip_cleaning
+
+             # Reset visualization- and model-training-related session state
+             if "column_types" in st.session_state:
+                 del st.session_state.column_types
+             if "corr_matrix" in st.session_state:
+                 del st.session_state.corr_matrix
+             if "df_hash" in st.session_state:
+                 del st.session_state.df_hash
+             if "test_results_calculated" in st.session_state:
+                 st.session_state.test_results_calculated = False
+
+             if skip_cleaning:
+                 st.success("✅ Using uploaded dataset as-is (skipped cleaning).")
+             else:
+                 st.success("✅ Data cleaned successfully!")
+
+         except Exception as e:
+             st.error(f"❌ Error processing dataset: {str(e)}")
+             st.info("ℹ️ Please check that your file is a valid CSV and try again.")
src/utils/logging.py ADDED
@@ -0,0 +1,70 @@
+ import logging
+ import os
+ from datetime import datetime
+
+ # Create the logs directory if it doesn't exist
+ logs_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), 'logs')
+ if not os.path.exists(logs_dir):
+     os.makedirs(logs_dir)
+
+ # Configure the logger
+ def setup_logger():
+     """
+     Set up and configure the frontend error logger.
+     """
+     # Create a logger instance
+     logger = logging.getLogger('frontend_logger')
+     logger.setLevel(logging.DEBUG)
+
+     # Guard against duplicate handlers when Streamlit re-imports this module
+     if logger.handlers:
+         return logger
+
+     # Create a file handler
+     log_file = os.path.join(logs_dir, f'frontend_errors_{datetime.now().strftime("%Y%m%d")}.log')
+     file_handler = logging.FileHandler(log_file)
+     file_handler.setLevel(logging.DEBUG)
+
+     # Create a console handler
+     console_handler = logging.StreamHandler()
+     console_handler.setLevel(logging.ERROR)
+
+     # Create a formatter
+     formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+     file_handler.setFormatter(formatter)
+     console_handler.setFormatter(formatter)
+
+     # Add handlers to the logger
+     logger.addHandler(file_handler)
+     logger.addHandler(console_handler)
+
+     return logger
+
+ # Initialize the logger
+ frontend_logger = setup_logger()
+
+ def log_frontend_error(error_type: str, error_message: str, additional_info: dict = None):
+     """
+     Log frontend errors with detailed information.
+
+     Args:
+         error_type (str): Type of error (e.g., 'Arrow Conversion', 'Model Training', etc.)
+         error_message (str): The error message
+         additional_info (dict, optional): Additional context about the error
+     """
+     error_details = f"Type: {error_type}\nMessage: {error_message}"
+     if additional_info:
+         error_details += f"\nAdditional Info: {additional_info}"
+
+     frontend_logger.error(error_details)
+
+ def log_frontend_warning(warning_type: str, warning_message: str, additional_info: dict = None):
+     """
+     Log frontend warnings with detailed information.
+
+     Args:
+         warning_type (str): Type of warning
+         warning_message (str): The warning message
+         additional_info (dict, optional): Additional context about the warning
+     """
+     warning_details = f"Type: {warning_type}\nMessage: {warning_message}"
+     if additional_info:
+         warning_details += f"\nAdditional Info: {additional_info}"
+
+     frontend_logger.warning(warning_details)
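Editor's note: a short usage sketch for the two helpers; the message contents below are illustrative:

    from src.utils.logging import log_frontend_error, log_frontend_warning

    log_frontend_warning(
        "Arrow Conversion",
        "Column 'Memory' mixed int/str; coerced to string for display",
        additional_info={"column": "Memory", "dtype": "object"},
    )
    log_frontend_error("Model Training", "XGBoost not installed", {"fallback": "RandomForest"})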