"""Streamlit app: upload a CSV, preprocess it, train a random forest, download it.

The page flow is: upload -> preview/profile -> pick target -> label-encode
object columns -> median-fill missing values -> choose feature columns ->
train/test split -> train and score a RandomForest -> download the pickled
model.
"""
import os
import pickle

import numpy as np
import pandas as pd
import streamlit as st
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


def _encode_categoricals(df):
    """Label-encode every object-dtype column of *df* in place and return it.

    Values are cast to ``str`` first, so a missing value in such a column is
    encoded as its own ``"nan"`` category rather than left as NaN.
    """
    for col in df.select_dtypes(include=['object']).columns:
        st.write(f"Encoding column: {col}")
        df[col] = LabelEncoder().fit_transform(df[col].astype(str))
    return df


def _train_and_report(problem_type, X_train, X_test, y_train, y_test):
    """Fit a random forest for *problem_type*, write its metrics, return it.

    "Classification" reports accuracy and weighted F1; any other value is
    treated as regression and reports MAE, RMSE and R2.
    """
    if problem_type == "Classification":
        model = RandomForestClassifier(random_state=42)
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)

        accuracy = accuracy_score(y_test, predictions)
        f1 = f1_score(y_test, predictions, average='weighted')
        st.write(f"Accuracy: {accuracy:.2f}")
        st.write(f"F1 Score: {f1:.2f}")
    else:
        model = RandomForestRegressor(random_state=42)
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)

        mae = mean_absolute_error(y_test, predictions)
        rmse = np.sqrt(mean_squared_error(y_test, predictions))
        r2 = r2_score(y_test, predictions)
        st.write(f"Mean Absolute Error: {mae:.2f}")
        st.write(f"Root Mean Squared Error: {rmse:.2f}")
        st.write(f"R2 Score: {r2:.2f}")
    return model


def _run_workflow(uploaded_file):
    """Render the whole post-upload workflow for *uploaded_file*.

    Returns early (after showing an error) when the file cannot be parsed,
    when the column selection is unusable, or when training fails, so the
    caller can still render the rest of the page.
    """
    st.write("Debug: File uploaded successfully!")

    # Load the dataset; an empty or malformed upload should not crash the app.
    try:
        df = pd.read_csv(uploaded_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
        st.error(f"Could not read the uploaded CSV: {exc}")
        return

    st.write("### Data Preview")
    st.dataframe(df.head())

    # Data profiling
    st.write("### Dataset Summary")
    st.write(df.describe())

    # Missing values
    st.write("### Missing Value Analysis")
    st.write(df.isnull().sum())

    # Feature engineering
    st.write("### Feature Engineering")
    st.write("Choose the target column for model training:")
    target_column = st.selectbox("Target Column", df.columns)

    # Handle categorical data
    st.write("### Categorical Encoding")
    df = _encode_categoricals(df)

    # Handle missing values. numeric_only=True keeps this working on
    # pandas >= 2.0 even if a non-numeric column is still present.
    st.write("### Handle Missing Values")
    df = df.fillna(df.median(numeric_only=True))
    st.write("Missing values filled with column medians.")

    # Column selection
    st.write("### Select Features")
    all_columns = list(df.columns)
    selected_columns = st.multiselect(
        "Select columns to keep for training",
        options=all_columns,
        default=all_columns,
    )

    # The target must survive the selection, and at least one feature must
    # remain, otherwise the split below would raise.
    if target_column not in selected_columns:
        st.error(
            f"The target column '{target_column}' must be among the selected columns."
        )
        return
    if len(selected_columns) < 2:
        st.error("Select at least one feature column in addition to the target.")
        return

    df = df[selected_columns]
    st.write("Updated dataset:")
    st.dataframe(df.head())

    # Split dataset into train and test
    st.write("### Train-Test Split")
    test_size = st.slider(
        "Select test size (in %)", min_value=10, max_value=50, value=20, step=5
    )
    test_size = test_size / 100

    X = df.drop(columns=[target_column])
    y = df[target_column]

    # Model selection and training
    st.write("### Model Training")
    problem_type = st.radio(
        "Choose the type of problem:", ["Classification", "Regression"]
    )

    # scikit-learn signals unusable input (too few rows to split, a
    # continuous target passed to the classifier, ...) with ValueError;
    # surface it to the user instead of crashing the page.
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42
        )
        st.write("Train and test data prepared.")
        model = _train_and_report(problem_type, X_train, X_test, y_train, y_test)
    except ValueError as exc:
        st.error(f"Model training failed: {exc}")
        return

    # Save the model. Serialise in memory: no file is left on the server,
    # no handle is leaked, and concurrent sessions cannot overwrite each
    # other's model. A single download button avoids nesting it inside
    # st.button, whose True state lasts for only one rerun.
    st.write("### Save Trained Model")
    st.download_button(
        label="Download Trained Model",
        data=pickle.dumps(model),
        file_name="trained_model.pkl",
    )


# App title
st.title("Advanced Data Science Workflow with Auto ML 🚀📊")
st.write("Automate your data science workflow with feature engineering, model training, and deployment!")

# File upload
uploaded_file = st.file_uploader("Upload your CSV file", type=["csv"])

if uploaded_file is not None:
    _run_workflow(uploaded_file)

# Footer
st.write("---")
st.write("Developed by Dhanush. Empowering data scientists worldwide!")