import streamlit as st
import pandas as pd
import io
import numpy as np
import base64
from sklearn.impute import SimpleImputer
import scipy.stats as stats
import statsmodels.api as sm
import matplotlib.pyplot as plt
# Basic page configuration for the Streamlit app.
st.set_page_config(page_title="CSV Data Cleaning Tool")
# NOTE(review): this CSS block is currently empty, so the markdown call below
# injects nothing; presumably a placeholder for hiding Streamlit chrome.
hide_streamlit_style = """
"""
st.markdown(hide_streamlit_style, unsafe_allow_html=True)
st.title("CSV Data Tool")
# Thai: "Just click the Tool heading you want to use."
st.markdown("กดเลือกหัว Tool ข้อที่ต้องการจะใช้ได้เลยนะจ๊ะ")
# Multiple CSV uploads are allowed; each file becomes its own DataFrame below.
uploaded_files = st.file_uploader("Choose CSV files", type="csv", accept_multiple_files=True)
dataframes = []
if uploaded_files:
    # --- Load every uploaded CSV into its own DataFrame -------------------
    for file in uploaded_files:
        file.seek(0)  # rewind: Streamlit may have consumed the buffer on rerun
        dataframes.append(pd.read_csv(file))

    st.markdown("---")
    st.markdown("Data Cleansing")
    st.markdown("---")

    # BUG FIX: the original label said "columns", but drop_duplicates()
    # removes duplicate ROWS — label corrected to match the behaviour.
    if st.checkbox("Remove duplicate rows", value=False):
        for i, df in enumerate(dataframes):
            dataframes[i] = df.drop_duplicates(inplace=False)

    if st.checkbox("Remove empty rows", value=False):
        for i, df in enumerate(dataframes):
            # how="all": drop only rows where EVERY value is NaN
            dataframes[i] = df.dropna(how="all", inplace=False)

    if st.checkbox("Impute missing values with mean (for int and float columns)", value=False):
        for i, df in enumerate(dataframes):
            numeric_cols = df.select_dtypes(include=["int", "float"]).columns
            # Guard: SimpleImputer raises on an empty column selection.
            if len(numeric_cols) > 0:
                imputer = SimpleImputer(strategy="mean")
                df[numeric_cols] = imputer.fit_transform(df[numeric_cols])
                dataframes[i] = df

    if st.checkbox(
        "Impute missing values with most frequent category (for categorical columns)",
        value=False,
    ):
        for i, df in enumerate(dataframes):
            categorical_cols = df.select_dtypes(include=["object"]).columns
            if len(categorical_cols) > 0:
                imputer = SimpleImputer(strategy="most_frequent")
                df[categorical_cols] = imputer.fit_transform(df[categorical_cols])
                dataframes[i] = df

    # --- Outlier inspection (operates on the LAST uploaded DataFrame) -----
    df = dataframes[-1]
    # BUG FIX: only numeric columns are offered — computing a z-score on a
    # text column would raise a TypeError.
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    selected_out = st.selectbox("เลือก columns ที่จะดู Outlier", numeric_cols)
    if selected_out:
        col = df[selected_out]
        z_scores = np.abs((col - col.mean()) / col.std())
        threshold = 3  # conventional |z| > 3 cut-off for outliers
        outliers = df[z_scores > threshold]
        st.write("Outliers:")
        st.write(outliers[selected_out])

    for i, df in enumerate(dataframes):
        st.dataframe(df)

    st.markdown("---")
    st.markdown("Data transform")
    st.markdown("---")

    # The transform / distribution sections also act on the last DataFrame.
    df = dataframes[-1]
    selected_values = st.multiselect("เลือกค่าจากคอลัมน์", df.columns)
    convert_to_string = st.checkbox("convert columns to String", value=False)
    convert_to_float = st.checkbox("convert columns to Float", value=False)
    # Guard against an empty selection so astype() is not called on nothing.
    if convert_to_string and selected_values:
        df[selected_values] = df[selected_values].astype(str)
    if convert_to_float and selected_values:
        df[selected_values] = df[selected_values].astype(float)

    onehot = st.selectbox("เลือก columns ที่จะ Encoder", df.columns)
    if onehot:
        # BUG FIX: get_dummies returns a NEW frame; write it back so the
        # "Show DataFrames" and download sections see the encoded data.
        df = pd.get_dummies(df, columns=[onehot])
        dataframes[-1] = df

    st.markdown("---")
    st.markdown("Distribution")
    st.markdown("---")

    norm = st.multiselect("เลือก columns ที่จะ Scale Data โดยการใช้ Mapping", df.columns)
    if norm:
        # NOTE(review): np.log yields -inf/NaN for values <= 0 — assumes the
        # selected columns are strictly positive; confirm with users.
        df[norm] = df[norm].apply(np.log)

    qq = st.selectbox("QQplot", df.columns)
    if qq:
        # BUG FIX: the hard-coded 'team' group key crashed on files without
        # that column; warn instead of raising.
        if "team" in df.columns:
            grouped = df.groupby("team")[[qq]].mean()
            fig, ax = plt.subplots()
            stats.probplot(grouped[qq], dist="norm", plot=ax)
            st.pyplot(fig)
        else:
            st.warning("Column 'team' not found — the QQ plot groups by a 'team' column.")

    if st.checkbox("Show DataFrames", value=True):
        for i, df in enumerate(dataframes):
            st.write(f"DataFrame {i + 1}")
            st.dataframe(df)

    if st.button("Download cleaned data"):
        for i, df in enumerate(dataframes):
            csv = df.to_csv(index=False)
            b64 = base64.b64encode(csv.encode()).decode()
            # BUG FIX: the original rendered plain text, not a link; build a
            # real data-URI anchor so the browser actually downloads the file.
            href = (
                f'<a href="data:file/csv;base64,{b64}" '
                f'download="cleaned_data_{i + 1}.csv">Download cleaned_data_{i + 1}.csv</a>'
            )
            st.markdown(href, unsafe_allow_html=True)

# Page footer.
st.markdown("")
st.markdown("---")
st.markdown("")