File size: 3,526 Bytes
cb74654
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import streamlit as st
import pandas as pd
from file_manage import list_files
import os
from sklearn.preprocessing import LabelEncoder
import warnings
import json

# Silence every warning category for the whole process (no category or module
# filter is given, so nothing is exempt).
# NOTE(review): this also hides deprecation/FutureWarning messages from
# pandas and scikit-learn — consider narrowing the filter.
warnings.filterwarnings("ignore")

def _impute_and_encode(df):
    """Fill missing values and label-encode text columns of *df* in place.

    Every column containing nulls is filled with its mode (most frequent
    value). Columns holding object/string data are then replaced by the
    integer codes produced by a ``LabelEncoder``.

    Args:
        df: DataFrame to modify in place.

    Returns:
        A ``(mappings, skipped)`` tuple. ``mappings`` maps each encoded
        column name to a ``{original value as str: integer code}`` dict.
        ``skipped`` lists columns that contain only nulls: they have no mode,
        so they are left untouched.
    """
    mappings = {}
    skipped = []

    for col in df.columns:
        series = df[col]
        # Decide whether to encode before imputing, so the decision does not
        # depend on whatever dtype fillna() hands back.
        is_text = (
            pd.api.types.is_object_dtype(series)
            or pd.api.types.is_string_dtype(series)
        )

        if series.isnull().any():
            modes = series.mode()
            if modes.empty:
                # All values are null: there is nothing to impute from.
                skipped.append(col)
                continue
            # Assign the result back instead of `df[col].fillna(inplace=True)`:
            # the chained in-place form operates on a temporary and does not
            # update `df` under pandas Copy-on-Write.
            series = series.fillna(modes.iloc[0])
            df[col] = series

        if is_text:
            le = LabelEncoder()
            # Encode the string form so columns with mixed value types do not
            # break the encoder's sort; mapping keys are strings anyway.
            df[col] = le.fit_transform(series.astype(str))
            # A class's code is its index in the sorted `classes_` array.
            mappings[col] = {
                str(label): code for code, label in enumerate(le.classes_)
            }

    return mappings, skipped


def preprocess_data():
    """Render the Streamlit 'Data Preprocessing' page.

    Lets the user pick one of the files returned by ``list_files()`` (read
    from the ``uploads`` folder), preview it, and then optionally:

    * show the unique values of every column,
    * show per-column null counts and the object-dtype columns,
    * preprocess the data: impute nulls with the column mode, label-encode
      text columns, write the resulting CSV to ``preprocessed_data/`` under
      the same file name, and write the encoding mappings as JSON to
      ``mapping/``.

    Returns:
        None. All output goes to the Streamlit page and to disk.
    """
    st.title('Data Preprocessing')

    csv_files = list_files()
    if not csv_files:
        st.warning("No CSV files available for preprocessing.")
        return

    selected_file = st.selectbox('Select a CSV file for Preprocessing', csv_files)
    if not selected_file:
        return

    df = pd.read_csv(os.path.join('uploads', selected_file))
    st.write('**Data Preview:**')
    st.dataframe(df.head())

    # Step 1: Show unique values before encoding
    if st.button('Show Unique Values'):
        st.subheader("Unique Values in Each Column")
        for col in df.columns:
            st.write(f"**{col}** ({df[col].dtype}) → {df[col].nunique()} unique values")
            st.write(df[col].unique())

    # Step 2: Analyze data
    if st.button('Analyze Data'):
        null_counts = df.isnull().sum()
        null_info = pd.DataFrame({'Column': null_counts.index, 'Null Values': null_counts.values})
        st.subheader("Null Values Information")
        st.dataframe(null_info)

        categorical_data = df.select_dtypes(include=['object']).columns.tolist()
        st.subheader("Categorical Columns")
        st.write(categorical_data if categorical_data else "No categorical columns found.")

    # Step 3: Preprocess
    if st.button('Preprocess Data'):
        labelencoder_mappings, skipped_columns = _impute_and_encode(df)

        for col in skipped_columns:
            st.warning(
                f"Column `{col}` contains only missing values; "
                "it was left unchanged."
            )

        # Show encoding mapping in Streamlit
        for col, mapping_dict in labelencoder_mappings.items():
            st.write(f"**Encoding for column `{col}`:**")
            st.json(mapping_dict)

        # Save preprocessed CSV
        preprocessed_folder = 'preprocessed_data'
        os.makedirs(preprocessed_folder, exist_ok=True)
        preprocessed_filename = selected_file
        df.to_csv(os.path.join(preprocessed_folder, preprocessed_filename), index=False)
        st.success(f"Preprocessed file saved as {preprocessed_filename}")

        # Save label encoder mappings
        mapping_folder = "mapping"
        os.makedirs(mapping_folder, exist_ok=True)
        # NOTE(review): split('.')[0] truncates at the FIRST dot, so
        # "a.2023.csv" and "a.2024.csv" both map to "a.json". Kept as-is
        # because other code may locate the mapping by the same rule.
        mapping_file = os.path.join(mapping_folder, f"{selected_file.split('.')[0]}.json")
        with open(mapping_file, "w", encoding="utf-8") as f:
            json.dump(labelencoder_mappings, f, indent=4)
        st.success(f"Label encoding mappings saved in {mapping_file}")

        # Show final preprocessed data
        st.subheader("Preprocessed Data")
        st.dataframe(df)
        st.success("Data preprocessing completed successfully!")