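# Repository page residue captured with this file; kept verbatim as comments
# so that the module remains valid Python:
#   Dhanush7080's picture
#   Rename automl_app.py to app.py
#   a84089e verified
#   Raw
#   History Blame Contribute Delete
#   3.99 kB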
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
import os
import pickle
# Streamlit app: upload a CSV, pick a target column, label-encode non-numeric
# columns, fill missing numeric values with medians, train a random forest
# (classifier or regressor), report hold-out metrics and offer the fitted
# model as a pickle download.
#
# Streamlit re-executes this script from top to bottom on every widget
# interaction, so all state below is rebuilt on each rerun.

# App title
st.title("Advanced Data Science Workflow with Auto ML 🚀📊")
st.write("Automate your data science workflow with feature engineering, model training, and deployment!")

# File upload
uploaded_file = st.file_uploader("Upload your CSV file", type=["csv"])

if uploaded_file:
    st.write("Debug: File uploaded successfully!")

    # Load the dataset
    df = pd.read_csv(uploaded_file)
    st.write("### Data Preview")
    st.dataframe(df.head())

    # Data profiling
    st.write("### Dataset Summary")
    st.write(df.describe())

    # Missing values
    st.write("### Missing Value Analysis")
    st.write(df.isnull().sum())

    # Feature engineering
    st.write("### Feature Engineering")
    st.write("Choose the target column for model training:")
    target_column = st.selectbox("Target Column", df.columns)

    # Handle categorical data
    st.write("### Categorical Encoding")
    # Encode every column that is neither numeric nor boolean, not only
    # `object` ones: any other non-numeric dtype would otherwise break the
    # median fill and the model fit below.
    categorical_columns = df.select_dtypes(exclude=["number", "bool"]).columns
    for col in categorical_columns:
        st.write(f"Encoding column: {col}")
        # astype(str) turns missing entries into the literal "nan", so they
        # are encoded as a category of their own.
        df[col] = LabelEncoder().fit_transform(df[col].astype(str))

    # Handle missing values
    st.write("### Handle Missing Values")
    # numeric_only=True keeps this from raising on pandas >= 2.0 should any
    # non-numeric column remain.
    df = df.fillna(df.median(numeric_only=True))
    st.write("Missing values filled with column medians.")

    # Column selection
    st.write("### Select Features")
    # The target is chosen separately above; offering it here would let the
    # user drop it and crash the X/y split with a KeyError.
    feature_options = [col for col in df.columns if col != target_column]
    selected_columns = st.multiselect(
        "Select columns to keep for training",
        options=feature_options,
        default=feature_options,
    )
    df = df[selected_columns + [target_column]]
    st.write("Updated dataset:")
    st.dataframe(df.head())

    if not selected_columns:
        # A model cannot be fitted on zero features; ask instead of crashing.
        st.warning("Select at least one feature column to train a model.")
    else:
        # Split dataset into train and test
        st.write("### Train-Test Split")
        test_percent = st.slider("Select test size (in %)", min_value=10, max_value=50, value=20, step=5)
        X = df[selected_columns]
        y = df[target_column]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_percent / 100, random_state=42
        )
        st.write("Train and test data prepared.")

        # Model selection and training
        st.write("### Model Training")
        problem_type = st.radio("Choose the type of problem:", ["Classification", "Regression"])

        if problem_type == "Classification":
            model = RandomForestClassifier(random_state=42)
            model.fit(X_train, y_train)
            predictions = model.predict(X_test)

            # Performance metrics
            accuracy = accuracy_score(y_test, predictions)
            f1 = f1_score(y_test, predictions, average='weighted')
            st.write(f"Accuracy: {accuracy:.2f}")
            st.write(f"F1 Score: {f1:.2f}")
        else:
            model = RandomForestRegressor(random_state=42)
            model.fit(X_train, y_train)
            predictions = model.predict(X_test)

            # Performance metrics
            mae = mean_absolute_error(y_test, predictions)
            rmse = np.sqrt(mean_squared_error(y_test, predictions))
            r2 = r2_score(y_test, predictions)
            st.write(f"Mean Absolute Error: {mae:.2f}")
            st.write(f"Root Mean Squared Error: {rmse:.2f}")
            st.write(f"R2 Score: {r2:.2f}")

        # Save the model
        st.write("### Save Trained Model")
        # Serialise in memory and hand the bytes straight to the download
        # button: no file is written to the server's working directory (which
        # concurrent sessions would share), no file handle is left open, and
        # the button is not nested inside another button whose state resets
        # on the next rerun.
        st.download_button(
            label="Download Trained Model",
            data=pickle.dumps(model),
            file_name="trained_model.pkl",
        )

# Footer
st.write("---")
st.write("Developed by Dhanush. Empowering data scientists worldwide!")