Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| from file_manage import list_files | |
| import os | |
| from sklearn.preprocessing import LabelEncoder | |
| import warnings | |
| import json | |
| warnings.filterwarnings("ignore") | |
def preprocess_data() -> None:
    """Render the Streamlit "Data Preprocessing" page.

    The user picks one of the entries returned by ``list_files()``; that
    file is read from the ``uploads`` directory and previewed. Three buttons
    then offer:

    1. *Show Unique Values* - dtype, unique count and unique values of every
       column.
    2. *Analyze Data* - null counts per column and the list of ``object``
       (categorical) columns.
    3. *Preprocess Data* - fill missing values with each column's mode,
       label-encode ``object`` columns, write the result to
       ``preprocessed_data/<file>`` and the label mappings to
       ``mapping/<file stem>.json``, then display the processed frame.

    Returns:
        None. All output goes to the Streamlit page and to disk.
    """
    st.title('Data Preprocessing')

    csv_files = list_files()
    if not csv_files:
        st.warning("No CSV files available for preprocessing.")
        return

    selected_file = st.selectbox('Select a CSV file for Preprocessing', csv_files)
    if not selected_file:
        return

    df = pd.read_csv(os.path.join('uploads', selected_file))
    st.write('**Data Preview:**')
    st.dataframe(df.head())

    # Step 1: Show unique values before encoding
    if st.button('Show Unique Values'):
        st.subheader("Unique Values in Each Column")
        for col in df.columns:
            st.write(f"**{col}** ({df[col].dtype}) → {df[col].nunique()} unique values")
            st.write(df[col].unique())

    # Step 2: Analyze data
    if st.button('Analyze Data'):
        null_counts = df.isnull().sum()
        null_info = pd.DataFrame({'Column': null_counts.index, 'Null Values': null_counts.values})
        st.subheader("Null Values Information")
        st.dataframe(null_info)

        categorical_data = df.select_dtypes(include=['object']).columns.tolist()
        st.subheader("Categorical Columns")
        st.write(categorical_data if categorical_data else "No categorical columns found.")

    # Step 3: Preprocess
    if not st.button('Preprocess Data'):
        return

    labelencoder_mappings = {}  # column name -> {original label: encoded int}
    for col in df.columns:
        # Fill missing values with the most frequent value of the column.
        if df[col].isnull().any():
            modes = df[col].mode()
            if modes.empty:
                # mode() ignores NaN, so an all-null column has no mode;
                # indexing it would raise, so leave the column untouched.
                st.warning(f"Column `{col}` has no non-null values; missing values were left unfilled.")
            else:
                # Assign back instead of chained `inplace=True`, which does
                # not reliably modify `df` (and fails under Copy-on-Write).
                df[col] = df[col].fillna(modes.iloc[0])

        # Encode categorical columns
        if df[col].dtype == 'object':
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col])
            # The code of each class is its position in `classes_`.
            mapping_dict = {str(label): code for code, label in enumerate(le.classes_)}
            labelencoder_mappings[col] = mapping_dict

            # Show encoding mapping in Streamlit
            st.write(f"**Encoding for column `{col}`:**")
            st.json(mapping_dict)

    # Save preprocessed CSV (same file name, different folder)
    preprocessed_folder = 'preprocessed_data'
    os.makedirs(preprocessed_folder, exist_ok=True)
    preprocessed_filename = selected_file
    df.to_csv(os.path.join(preprocessed_folder, preprocessed_filename), index=False)
    st.success(f"Preprocessed file saved as {preprocessed_filename}")

    # Save label encoder mappings
    mapping_folder = "mapping"
    os.makedirs(mapping_folder, exist_ok=True)
    # NOTE(review): split('.')[0] truncates names with several dots
    # ("data.v2.csv" -> "data"), so such files can overwrite each other's
    # mapping. Kept as-is because other code may derive the same path;
    # confirm the readers before switching to os.path.splitext.
    mapping_file = os.path.join(mapping_folder, f"{selected_file.split('.')[0]}.json")
    with open(mapping_file, "w", encoding="utf-8") as f:
        json.dump(labelencoder_mappings, f, indent=4)
    st.success(f"Label encoding mappings saved in {mapping_file}")

    # Show final preprocessed data
    st.subheader("Preprocessed Data")
    st.dataframe(df)
    st.success("Data preprocessing completed successfully!")