import streamlit as st
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
import os
import pickle

# Page header: the app title followed by a one-line description.
APP_TITLE = "Advanced Data Science Workflow with Auto ML 🚀📊"
APP_TAGLINE = "Automate your data science workflow with feature engineering, model training, and deployment!"
st.title(APP_TITLE)
st.write(APP_TAGLINE)

# File upload
# Everything below runs only once a CSV has been provided. The steps are:
# preview/profile -> encode categoricals -> fill missing values -> pick
# columns -> train/test split -> fit a random forest -> offer the fitted
# model for download.
uploaded_file = st.file_uploader("Upload your CSV file", type=["csv"])
if uploaded_file:
    st.write("Debug: File uploaded successfully!")

    # Load the dataset
    df = pd.read_csv(uploaded_file)
    st.write("### Data Preview")
    st.dataframe(df.head())

    # Data profiling
    st.write("### Dataset Summary")
    st.write(df.describe())

    # Missing values
    st.write("### Missing Value Analysis")
    st.write(df.isnull().sum())

    # Feature engineering
    st.write("### Feature Engineering")
    st.write("Choose the target column for model training:")
    target_column = st.selectbox("Target Column", df.columns)

    # Handle categorical data
    # Every object-dtype column (the target included, if it is one) is
    # replaced by integer codes. Casting to str first gives LabelEncoder a
    # uniform type to work with.
    st.write("### Categorical Encoding")
    categorical_columns = df.select_dtypes(include=['object']).columns
    for col in categorical_columns:
        st.write(f"Encoding column: {col}")
        encoder = LabelEncoder()
        df[col] = encoder.fit_transform(df[col].astype(str))

    # Handle missing values
    # numeric_only=True keeps the median computation from raising if a
    # non-numeric column that was not encoded above is still present; such a
    # column is simply left unfilled.
    st.write("### Handle Missing Values")
    df = df.fillna(df.median(numeric_only=True))
    st.write("Missing values filled with column medians.")

    # Column selection
    st.write("### Select Features")
    selected_columns = st.multiselect("Select columns to keep for training", options=df.columns, default=df.columns)
    df = df[selected_columns]
    st.write("Updated dataset:")
    st.dataframe(df.head())

    # The target must survive the column selection, and at least one other
    # column must remain to serve as a feature. Report the problem in the UI
    # instead of letting the split/training code fail with a traceback.
    feature_columns = [col for col in selected_columns if col != target_column]
    if target_column not in selected_columns:
        st.error(f"The target column '{target_column}' must remain selected. Add it back to continue.")
    elif not feature_columns:
        st.error("Select at least one feature column in addition to the target column.")
    else:
        # Split dataset into train and test
        st.write("### Train-Test Split")
        test_size = st.slider("Select test size (in %)", min_value=10, max_value=50, value=20, step=5)
        test_size = test_size / 100  # slider is in percent; sklearn expects a fraction

        X = df.drop(columns=[target_column])
        y = df[target_column]
        # Fixed random_state so the same upload and settings give the same split.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

        st.write("Train and test data prepared.")

        # Model selection and training
        st.write("### Model Training")
        problem_type = st.radio("Choose the type of problem:", ["Classification", "Regression"])

        if problem_type == "Classification":
            model = RandomForestClassifier(random_state=42)
            model.fit(X_train, y_train)
            predictions = model.predict(X_test)

            # Performance metrics
            accuracy = accuracy_score(y_test, predictions)
            f1 = f1_score(y_test, predictions, average='weighted')
            st.write(f"Accuracy: {accuracy:.2f}")
            st.write(f"F1 Score: {f1:.2f}")

        else:
            model = RandomForestRegressor(random_state=42)
            model.fit(X_train, y_train)
            predictions = model.predict(X_test)

            # Performance metrics
            mae = mean_absolute_error(y_test, predictions)
            rmse = np.sqrt(mean_squared_error(y_test, predictions))
            r2 = r2_score(y_test, predictions)
            st.write(f"Mean Absolute Error: {mae:.2f}")
            st.write(f"Root Mean Squared Error: {rmse:.2f}")
            st.write(f"R2 Score: {r2:.2f}")

        # Save the model
        st.write("### Save Trained Model")
        if st.button("Download Model"):
            model_filename = "trained_model.pkl"
            # Serialize in memory rather than via a file on the server: a
            # fixed path in the working directory would be shared by all
            # sessions, and no file handle is left open this way.
            model_bytes = pickle.dumps(model)
            st.download_button(label="Download Trained Model", data=model_bytes, file_name=model_filename)

# Footer: a "---" separator line followed by the author credit. This sits
# outside the upload branch, so it is written on every run of the script.
for footer_line in ("---", "Developed by Dhanush. Empowering data scientists worldwide!"):
    st.write(footer_line)