"""Streamlit app for cleaning, transforming and exporting uploaded CSV files.

Flow of the page, top to bottom:

1. Upload one or more CSV files; each becomes a DataFrame in ``dataframes``.
2. Data cleansing options are applied to *every* uploaded file.
3. Outlier inspection, type conversion, one-hot encoding, log scaling and the
   QQ plot operate on the *last* uploaded file only (the script has a single
   set of column pickers). Their result is written back into ``dataframes``
   so that the preview and the download reflect it.
4. The cleaned frames can be previewed and downloaded as CSV.

Streamlit re-runs this script on every widget interaction, so all steps start
again from the freshly parsed upload on each run.
"""
import base64
import io

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import statsmodels.api as sm
import streamlit as st
from sklearn.impute import SimpleImputer

# |z| above this value marks a row as an outlier.
OUTLIER_Z_THRESHOLD = 3
# When this column exists, the QQ plot is drawn from per-group means.
QQ_GROUP_COLUMN = "team"


def _impute_columns(frame, columns, strategy):
    """Return ``frame`` with missing values in ``columns`` filled in.

    Only columns that contain both missing and observed values are passed to
    ``SimpleImputer``: a column without gaps needs no work, and a column that
    is entirely missing is dropped by the imputer, which would make the
    assignment back into the frame fail on a shape mismatch.
    """
    targets = [
        col for col in columns
        if frame[col].isna().any() and frame[col].notna().any()
    ]
    if not targets:
        return frame
    result = frame.copy()
    result[targets] = SimpleImputer(strategy=strategy).fit_transform(result[targets])
    return result


def _numeric_columns(frame):
    """Return the names of the numeric columns of ``frame`` as a list."""
    return list(frame.select_dtypes(include="number").columns)


def _qq_sample(frame, column, group_column=QQ_GROUP_COLUMN):
    """Return the finite values to feed into the QQ plot for ``column``.

    Uses the per-group mean when ``group_column`` is available, otherwise the
    raw column values, so files without that column can still be plotted.
    """
    if group_column in frame.columns and column != group_column:
        values = frame.groupby(group_column)[column].mean()
    else:
        values = frame[column]
    # The log transform can introduce +/-inf; probplot needs finite numbers.
    return values.replace([np.inf, -np.inf], np.nan).dropna()


def _download_link(frame, number):
    """Return an HTML anchor that downloads ``frame`` as a CSV data URI."""
    csv_text = frame.to_csv(index=False)
    b64 = base64.b64encode(csv_text.encode()).decode()
    filename = f"cleaned_data_{number}.csv"
    return (
        f'<a href="data:text/csv;base64,{b64}" download="{filename}">'
        f"Download {filename}</a>"
    )


st.set_page_config(page_title="CSV Data Cleaning Tool")

hide_streamlit_style = """ """
st.markdown(hide_streamlit_style, unsafe_allow_html=True)

st.title("CSV Data Tool")
st.markdown("กดเลือกหัว Tool ข้อที่ต้องการจะใช้ได้เลยนะจ๊ะ")

uploaded_files = st.file_uploader("Choose CSV files", type="csv", accept_multiple_files=True)

dataframes = []

if uploaded_files:
    for file in uploaded_files:
        file.seek(0)
        try:
            dataframes.append(pd.read_csv(file))
        except (pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
            # One unreadable upload should not take the whole page down.
            st.error(f"Could not read {file.name}: {exc}")

if dataframes:
    # ------------------------------------------------------------------
    # Data cleansing: applied to every uploaded file.
    # ------------------------------------------------------------------
    st.markdown("---")
    st.markdown("Data Cleansing")
    st.markdown("---")

    # NOTE(review): the label says "columns" but drop_duplicates() removes
    # duplicate *rows*. Behaviour and label are kept as they were; confirm
    # which one was intended.
    duplicate_columns = st.checkbox("Remove duplicate columns", value=False)
    if duplicate_columns:
        dataframes = [frame.drop_duplicates() for frame in dataframes]

    remove_empty_rows = st.checkbox("Remove empty rows", value=False)
    if remove_empty_rows:
        dataframes = [frame.dropna(how="all") for frame in dataframes]

    impute_mean = st.checkbox("Impute missing values with mean (for int and float columns)",value=False)
    if impute_mean:
        dataframes = [
            _impute_columns(frame, _numeric_columns(frame), "mean")
            for frame in dataframes
        ]

    impute_most_frequent = st.checkbox("Impute missing values with most frequent category (for categorical columns)",value=False)
    if impute_most_frequent:
        dataframes = [
            _impute_columns(
                frame,
                frame.select_dtypes(include=["object"]).columns,
                "most_frequent",
            )
            for frame in dataframes
        ]

    # Everything below works on the last uploaded file. Reading it from the
    # list (instead of a leftover loop variable) guarantees the cleaned
    # version is used.
    active = len(dataframes) - 1
    df = dataframes[active]

    # Z-scores are only defined for numeric data, so only those columns are
    # offered; selectbox returns None when there are no options.
    selected_out = st.selectbox("เลือก columns ที่จะดู Outlier", _numeric_columns(df))
    if selected_out is not None:
        column = df[selected_out]
        z_scores = np.abs((column - column.mean()) / column.std())
        st.write("Outliers:")
        st.write(df.loc[z_scores > OUTLIER_Z_THRESHOLD, selected_out])

    for frame in dataframes:
        st.dataframe(frame)

    # ------------------------------------------------------------------
    # Data transform: last uploaded file only.
    # ------------------------------------------------------------------
    st.markdown("---")
    st.markdown("Data transform")
    st.markdown("---")

    df = df.copy()  # keep the transforms off the frame shown above

    selected_values = st.multiselect("เลือกค่าจากคอลัมน์", df.columns)
    convert_to_String = st.checkbox("convert columns to String", value=False)
    convert_to_float = st.checkbox("convert columns to Float", value=False)
    if selected_values:
        try:
            if convert_to_String:
                df[selected_values] = df[selected_values].astype(str)
            if convert_to_float:
                df[selected_values] = df[selected_values].astype(float)
        except (ValueError, TypeError) as exc:
            st.error(f"Could not convert the selected columns: {exc}")

    # A selectbox always has a selection, so a leading None entry is needed
    # to make encoding opt-in instead of always encoding the first column.
    onehot = st.selectbox(
        "เลือก columns ที่จะ Encoder",
        [None, *df.columns],
        format_func=lambda col: "(none)" if col is None else str(col),
    )
    if onehot is not None:
        df = pd.get_dummies(df, columns=[onehot])

    # ------------------------------------------------------------------
    # Distribution: last uploaded file only.
    # ------------------------------------------------------------------
    st.markdown("---")
    st.markdown("Distribution")
    st.markdown("---")

    norm = st.multiselect("เลือก columns ที่จะ Scale Data โดยการใช้ Mapping", _numeric_columns(df))
    if norm:
        if (df[norm] <= 0).any().any():
            st.warning(
                "Some selected values are zero or negative; "
                "their logarithm is -inf or NaN."
            )
        with np.errstate(divide="ignore", invalid="ignore"):
            df[norm] = np.log(df[norm])

    qq = st.selectbox("QQplot", _numeric_columns(df))
    if qq is not None:
        sample = _qq_sample(df, qq)
        if len(sample) < 2:
            st.warning("Not enough data points to draw a QQ plot.")
        else:
            fig, ax = plt.subplots()
            stats.probplot(sample, dist="norm", plot=ax)
            st.pyplot(fig)
            # pyplot keeps every figure alive until closed; without this the
            # server leaks one figure per rerun.
            plt.close(fig)

    # Publish the transformed frame so preview and download include it.
    dataframes[active] = df

    # ------------------------------------------------------------------
    # Preview and export.
    # ------------------------------------------------------------------
    show_dataframes = st.checkbox("Show DataFrames", value=True)
    if show_dataframes:
        for i, frame in enumerate(dataframes):
            st.write(f"DataFrame {i + 1}")
            st.dataframe(frame)

    if st.button("Download cleaned data"):
        for i, frame in enumerate(dataframes):
            st.markdown(_download_link(frame, i + 1), unsafe_allow_html=True)

st.markdown("")
st.markdown("---")
st.markdown("")