No_code_ML_builder / src /preprocessing.py
Shubham 10000
update to main file
cb74654
import streamlit as st
import pandas as pd
from file_manage import list_files
import os
from sklearn.preprocessing import LabelEncoder
import warnings
import json
warnings.filterwarnings("ignore")
def preprocess_data():
    """Render the Streamlit "Data Preprocessing" page.

    The user picks one of the file names returned by ``list_files()``; the
    file is read from the ``uploads`` directory and previewed. Three buttons
    are offered:

    * "Show Unique Values" lists the unique values of every column.
    * "Analyze Data" shows per-column null counts and the ``object`` columns.
    * "Preprocess Data" fills missing values with each column's mode,
      label-encodes ``object`` columns, writes the resulting CSV to
      ``preprocessed_data/<selected file>`` and the label mappings to
      ``mapping/<file stem>.json``.

    Returns:
        None. All results are rendered on the page or written to disk.
    """
    st.title('Data Preprocessing')

    csv_files = list_files()
    if not csv_files:
        st.warning("No CSV files available for preprocessing.")
        return

    selected_file = st.selectbox('Select a CSV file for Preprocessing', csv_files)
    if not selected_file:
        return

    df = pd.read_csv(os.path.join('uploads', selected_file))
    st.write('**Data Preview:**')
    st.dataframe(df.head())

    # Step 1: Show unique values before encoding
    if st.button('Show Unique Values'):
        st.subheader("Unique Values in Each Column")
        for col in df.columns:
            st.write(f"**{col}** ({df[col].dtype}) → {df[col].nunique()} unique values")
            st.write(df[col].unique())

    # Step 2: Analyze data
    if st.button('Analyze Data'):
        null_counts = df.isnull().sum()
        null_info = pd.DataFrame({'Column': null_counts.index, 'Null Values': null_counts.values})
        st.subheader("Null Values Information")
        st.dataframe(null_info)

        categorical_data = df.select_dtypes(include=['object']).columns.tolist()
        st.subheader("Categorical Columns")
        st.write(categorical_data if categorical_data else "No categorical columns found.")

    # Step 3: Preprocess
    if not st.button('Preprocess Data'):
        return

    labelencoder_mappings = {}  # column name -> {original label: encoded integer}
    for col in df.columns:
        # Fill missing values with the most frequent value of the column.
        if df[col].isnull().any():
            modes = df[col].mode()
            if modes.empty:
                # A column with no non-null values has no mode; indexing the
                # empty result would raise, so leave the column untouched.
                st.warning(f"Column `{col}` has no non-null values; missing values were left unfilled.")
            else:
                # Assign the result back instead of using ``inplace=True`` on
                # the selected column: the in-place form acts on a temporary
                # object and does not reliably update ``df``.
                df[col] = df[col].fillna(modes.iloc[0])

        # Encode categorical columns
        if df[col].dtype == 'object':
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col])
            # The encoded value of a class is its position in ``classes_``.
            mapping_dict = {str(label): code for code, label in enumerate(le.classes_)}
            labelencoder_mappings[col] = mapping_dict

            # Show encoding mapping in Streamlit
            st.write(f"**Encoding for column `{col}`:**")
            st.json(mapping_dict)

    # Save preprocessed CSV under the same file name as the upload.
    preprocessed_folder = 'preprocessed_data'
    os.makedirs(preprocessed_folder, exist_ok=True)
    preprocessed_filename = selected_file
    df.to_csv(os.path.join(preprocessed_folder, preprocessed_filename), index=False)
    st.success(f"Preprocessed file saved as {preprocessed_filename}")

    # Save label encoder mappings
    mapping_folder = "mapping"
    os.makedirs(mapping_folder, exist_ok=True)
    # NOTE(review): the stem is everything before the FIRST dot, so
    # "data.v2.csv" and "data.csv" share one mapping file. Kept as-is because
    # other modules may locate the mapping by this exact name - confirm
    # before switching to os.path.splitext.
    mapping_file = os.path.join(mapping_folder, f"{selected_file.split('.')[0]}.json")
    with open(mapping_file, "w", encoding="utf-8") as f:
        json.dump(labelencoder_mappings, f, indent=4)
    st.success(f"Label encoding mappings saved in {mapping_file}")

    # Show final preprocessed data
    st.subheader("Preprocessed Data")
    st.dataframe(df)
    st.success("Data preprocessing completed successfully!")