"""Streamlit app for cleaning, inspecting and exporting uploaded CSV files.

The script reads every uploaded CSV into a pandas DataFrame, offers optional
cleansing steps (duplicate/empty-row removal, imputation), a z-score outlier
view, per-file column type conversion, and a download link per cleaned file.
"""

import base64
import io

import numpy as np
import pandas as pd
import streamlit as st
from sklearn.impute import SimpleImputer

# Rows whose absolute z-score exceeds this value are reported as outliers.
OUTLIER_Z_THRESHOLD = 3


def _impute_in_place(df, columns, strategy):
    """Fill missing values in *columns* of *df* in place with SimpleImputer.

    Args:
        df: DataFrame to modify.
        columns: Candidate column labels to impute.
        strategy: SimpleImputer strategy name (e.g. ``'mean'``).

    Columns that contain no observed value are skipped: SimpleImputer discards
    them from its output, so assigning the result back would fail on a shape
    mismatch. An empty selection is skipped as well because fit_transform
    rejects input with zero features.
    """
    usable = [name for name in columns if df[name].notna().any()]
    if not usable:
        return
    imputer = SimpleImputer(strategy=strategy)
    df[usable] = imputer.fit_transform(df[usable])


st.set_page_config(page_title="CSV Data Cleaning Tool")

# NOTE(review): this string is whitespace-only here; presumably it was meant
# to carry custom CSS for the page -- confirm against the original file.
hide_streamlit_style = """ """
st.markdown(hide_streamlit_style, unsafe_allow_html=True)

st.title("CSV Data Tool")
st.markdown("กดเลือกหัว Tool ข้อที่ต้องการจะใช้ได้เลยนะจ๊ะ")

uploaded_files = st.file_uploader("Choose CSV files", type="csv", accept_multiple_files=True)

# One DataFrame per uploaded file, in upload order.
dataframes = []
if uploaded_files:
    for file in uploaded_files:
        # Rewind in case the buffer was already consumed by an earlier read.
        file.seek(0)
        dataframes.append(pd.read_csv(file))

st.markdown("Data Cleansing")

# NOTE(review): the label says "columns" but drop_duplicates() removes
# duplicate *rows*; behavior is kept as-is, confirm which one is intended.
duplicate_columns = st.checkbox("Remove duplicate columns", value=False)
if duplicate_columns:
    dataframes = [df.drop_duplicates() for df in dataframes]

remove_empty_rows = st.checkbox("Remove empty rows", value=False)
if remove_empty_rows:
    # how="all" drops only rows in which every cell is missing.
    dataframes = [df.dropna(how="all") for df in dataframes]

impute_mean = st.checkbox("Impute missing values with mean (for int and float columns)", value=False)
if impute_mean:
    for df in dataframes:
        numeric_cols = df.select_dtypes(include=['int', 'float']).columns
        _impute_in_place(df, numeric_cols, 'mean')

impute_most_frequent = st.checkbox(
    "Impute missing values with most frequent category (for categorical columns)",
    value=False,
)
if impute_most_frequent:
    for df in dataframes:
        categorical_cols = df.select_dtypes(include=['object']).columns
        _impute_in_place(df, categorical_cols, 'most_frequent')

# Outlier inspection. Without an upload there is nothing to inspect, so the
# whole section is skipped instead of failing on an undefined DataFrame.
if dataframes:
    # Read the frame from the list rather than a leftover loop variable: the
    # cleansing steps above replace list entries with new DataFrames, so a
    # leaked loop variable could still point at the uncleaned data.
    outlier_df = dataframes[-1]
    selected_out = st.selectbox("เลือก columns ที่จะดู Outlier", outlier_df.columns)
    if selected_out is not None:
        col = selected_out
        st.write(f"คอลัมน์ {col}:")
        values = outlier_df[col]
        if pd.api.types.is_numeric_dtype(values):
            # Absolute z-score of every row in the selected column.
            z_scores = np.abs((values - values.mean()) / values.std())
            outliers = outlier_df[z_scores > OUTLIER_Z_THRESHOLD]
            st.write("Outliers:")
            st.write(outliers)
        else:
            # A z-score is undefined for non-numeric data.
            st.warning("Outlier detection needs a numeric column.")

st.markdown("Data transform")
for i, df in enumerate(dataframes):
    st.dataframe(df)
    # Widgets created in a loop need distinct keys; otherwise a second file
    # produces widgets with identical arguments and Streamlit raises a
    # duplicate-widget error.
    selected_values = st.multiselect("เลือกค่าจากคอลัมน์", df.columns, key=f"transform_columns_{i}")
    convert_to_string = st.checkbox("convert columns to String", value=False, key=f"convert_to_string_{i}")
    convert_to_float = st.checkbox("convert columns to Float", value=False, key=f"convert_to_float_{i}")

    if not selected_values:
        continue  # nothing selected, nothing to convert
    if convert_to_string:
        df[selected_values] = df[selected_values].astype(str)
    if convert_to_float:
        try:
            df[selected_values] = df[selected_values].astype(float)
        except (ValueError, TypeError) as exc:
            # Report unparseable values instead of crashing the whole app.
            st.error(f"Cannot convert the selected columns to float: {exc}")

show_dataframes = st.checkbox("Show DataFrames", value=True)
if show_dataframes:
    for i, df in enumerate(dataframes):
        st.write(f"DataFrame {i + 1}")
        st.dataframe(df)

if st.button("Download cleaned data"):
    for i, df in enumerate(dataframes):
        csv = df.to_csv(index=False)
        b64 = base64.b64encode(csv.encode()).decode()
        filename = f"cleaned_data_{i + 1}.csv"
        # Embed the CSV as a base64 data URI so the link downloads the file;
        # the encoded payload was previously computed but never used.
        href = f'<a href="data:text/csv;base64,{b64}" download="{filename}">Download {filename}</a>'
        st.markdown(href, unsafe_allow_html=True)

st.markdown("")
st.markdown("---")
st.markdown("")